diff --git a/.claude/settings.json b/.claude/settings.json new file mode 100644 index 0000000000..4427955573 --- /dev/null +++ b/.claude/settings.json @@ -0,0 +1,5 @@ +{ + "enabledPlugins": { + "pyright-lsp@claude-plugins-official": true + } +} diff --git a/.claude/settings.local.json b/.claude/settings.local.json new file mode 100644 index 0000000000..77ce73ec00 --- /dev/null +++ b/.claude/settings.local.json @@ -0,0 +1,51 @@ +{ + "permissions": { + "allow": [ + "Read(**)", + "Glob(**)", + "Grep(**)", + "Bash(python -m archivebox:*)", + "Bash(ls:*)", + "Bash(xargs:*)", + "Bash(python -c:*)", + "Bash(printf:*)", + "Bash(pkill:*)", + "Bash(python3:*)", + "Bash(sqlite3:*)", + "WebFetch(domain:github.com)", + "Bash(uv add:*)", + "Bash(mkdir:*)", + "Bash(chmod:*)", + "Bash(python -m forum_dl:*)", + "Bash(archivebox manage migrate:*)", + "Bash(cat:*)", + "Bash(python archivebox/plugins/pip/on_Dependency__install_using_pip_provider.py:*)", + "Bash(forum-dl:*)", + "Bash(pip uninstall:*)", + "Bash(python:*)", + "Bash(source .venv/bin/activate)", + "Bash(mv:*)", + "Bash(echo:*)", + "Bash(grep:*)", + "WebFetch(domain:python-statemachine.readthedocs.io)", + "Bash(./bin/run_plugin_tests.sh:*)", + "Bash(done)", + "Bash(coverage erase:*)", + "Bash(gh api:*)" + ] + }, + "hooks": { + "PreToolUse": [ + { + "matcher": "Bash", + "hooks": [ + { + "type": "command", + "command": "REPO_ROOT=$(git rev-parse --show-toplevel 2>/dev/null); if [ -n \"$REPO_ROOT\" ] && [ \"$PWD\" != \"$REPO_ROOT\" ]; then echo \"ERROR: Not in repo root ($REPO_ROOT). Current dir: $PWD\" >&2; exit 1; fi", + "statusMessage": "Checking working directory..." 
+ } + ] + } + ] + } +} diff --git a/.dockerignore b/.dockerignore index 8cebf35e62..fac517b42d 100644 --- a/.dockerignore +++ b/.dockerignore @@ -5,17 +5,41 @@ __pycache__/ .mypy_cache/ .pytest_cache/ .github/ +.pdm-build/ +.pdm-python +.eggs/ +.git/ +.vscode/ +!.git/HEAD +!.git/refs/heads/* venv/ .venv/ +.venv-old/ +.docker_venv/ .docker-venv/ +node_modules/ +chrome/ +chromeprofile/ +chrome_profile/ +pdm.dev.lock +pdm.lock + +docs/ build/ dist/ -pip_dist/ -!pip_dist/archivebox.egg-info/requires.txt brew_dist/ +deb_dist/ +pip_dist/ assets/ +docker/ +website/ +typings/ +tmp/ data/ +data*/ output/ +index.sqlite3 +index.sqlite3-wal diff --git a/.flake8 b/.flake8 deleted file mode 100644 index 01af646deb..0000000000 --- a/.flake8 +++ /dev/null @@ -1,6 +0,0 @@ -[flake8] -ignore = D100,D101,D102,D103,D104,D105,D202,D203,D205,D400,E131,E241,E252,E266,E272,E701,E731,W293,W503,W291,W391 -select = F,E9,W -max-line-length = 130 -max-complexity = 10 -exclude = migrations,tests,node_modules,vendor,venv,.venv,.venv2,.docker-venv diff --git a/.github/.readthedocs.yaml b/.github/.readthedocs.yaml new file mode 100644 index 0000000000..2cefab193a --- /dev/null +++ b/.github/.readthedocs.yaml @@ -0,0 +1,26 @@ +# Read the Docs config for https://docs.archivebox.io +# https://docs.readthedocs.io/en/stable/config-file/v2.html + +version: 2 + +submodules: + include: all + recursive: true + +build: + os: ubuntu-22.04 + tools: + python: "3.12" + #nodejs: "20" # not needed unless we need the full archivebox to run while building docs for some reason + +sphinx: + configuration: docs/conf.py + +formats: + - pdf + - epub + +# https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html +python: + install: + - requirements: docs/requirements.txt diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml index ff0edb0f18..72dea7c5a8 100644 --- a/.github/FUNDING.yml +++ b/.github/FUNDING.yml @@ -1,3 +1,2 @@ -github: pirate -patreon: theSquashSH -custom: ["https://twitter.com/ArchiveBoxApp", 
"https://paypal.me/NicholasSweeting", "https://www.blockchain.com/eth/address/0x5D4c34D4a121Fe08d1dDB7969F07550f2dB9f471", "https://www.blockchain.com/btc/address/1HuxXriPE2Bbnag3jJrqa3bkNHrs297dYH"] +github: ["ArchiveBox", "pirate"] +custom: ["https://donate.archivebox.io", "https://swag.archivebox.io"] diff --git a/.github/ISSUE_TEMPLATE/1-bug_report.yml b/.github/ISSUE_TEMPLATE/1-bug_report.yml new file mode 100644 index 0000000000..40d9b2d02d --- /dev/null +++ b/.github/ISSUE_TEMPLATE/1-bug_report.yml @@ -0,0 +1,198 @@ +name: 🐞 Bug report +description: Report a bug or error you encountered in ArchiveBox +title: "Bug: ..." +assignees: + - pirate +type: 'Bug' +body: + - type: markdown + attributes: + value: | + *Please note:* it is normal to see errors occasionally for some extractors on some URLs (not every extractor will work on every type of page). + Please report archiving errors if you are seeing them *consistently across many URLs* or if they are *preventing you from using ArchiveBox*. + + - type: textarea + id: description + attributes: + label: Provide a screenshot and describe the bug + description: | + Attach a screenshot and describe what the issue is, what you expected to happen, and if relevant, the *URLs you were trying to archive*. + placeholder: | + Got a bunch of 'singlefile was unable to archive this page' errors when trying to archive URLs from this site: https://example.com/xyz ... + I also tried to archive the same URLs using `singlefile` directly and some of them worked but not all of them. etc. ... + validations: + required: true + + - type: textarea + id: steps_to_reproduce + attributes: + label: Steps to reproduce + description: Please provide the exact steps you took to trigger the issue (including any shell commands run, URLs visited, buttons clicked, etc.). + render: markdown + placeholder: | + 1. Started ArchiveBox by running: `docker run -v $PWD:/data -p 8000:8000 archivebox/archivebox` in iTerm2 + 2. 
Went to the https://127.0.0.1:8000/add/ page in Google Chrome + 3. Typed 'https://example.com/xyz' into the 'Add URL' input field + 4. Clicked the 'Add+' button + 5. Got a 500 error and saw the errors below in terminal + validations: + required: true + + - type: textarea + id: logs + attributes: + label: Logs or errors + description: "Paste any terminal output, logs, or errors (check `data/logs/errors.log` as well)." + placeholder: | + ╭─────────────────────────────────────────────────────────────────────────────────────────────────────────╮ + │ [2024-11-02 19:54:28] ArchiveBox v0.8.6rc0: archivebox add https://example.com#1234567 │ + ╰─────────────────────────────────────────────────────────────────────────────────────────────────────────╯ + + [+] [2024-11-02 19:54:29] Adding 1 links to index (crawl depth=0)... + > Saved verbatim input to sources/1730577269-import.txt + > Parsed 1 URLs from input (Generic TXT) + ... + render: shell + validations: + required: false + + - type: textarea + id: version + attributes: + label: ArchiveBox Version + description: | + **REQUIRED:** Run the `archivebox version` command inside your collection dir and paste the *full output* here (*not just the version number*). 
+ For Docker Compose run: `docker compose run archivebox version` + For plain Docker run: `docker run -v $PWD:/data archivebox/archivebox version` + render: shell + placeholder: | + 0.8.6 + ArchiveBox v0.8.6rc0 COMMIT_HASH=721427a BUILD_TIME=2024-10-21 12:57:02 1729515422 + IN_DOCKER=False IN_QEMU=False ARCH=arm64 OS=Darwin PLATFORM=macOS-15.1-arm64-arm-64bit PYTHON=Cpython (venv) + EUID=502:20 UID=502:20 PUID=502:20 FS_UID=502:20 FS_PERMS=644 FS_ATOMIC=True FS_REMOTE=False + DEBUG=False IS_TTY=True SUDO=False ID=dfa11485:aa78ad45 SEARCH_BACKEND=ripgrep LDAP=False + + Binary Dependencies: + √ python 3.14.0 venv_pip ~/.venv/bin/python + √ django 6.0 venv_pip ~/.venv/lib/python3.14/site-packages/django/__init__.py + √ sqlite 2.6.0 venv_pip ~/.venv/lib/python3.14/site-packages/django/db/backends/sqlite3/base.py + √ pip 24.3.1 venv_pip ~/.venv/bin/pip + ... + validations: + required: true + + - type: dropdown + id: install_method + validations: + required: true + attributes: + label: How did you install the version of ArchiveBox you are using? + multiple: false + options: + - pip + - apt + - brew + - nix + - Docker (or Podman/LXC/K8s/TrueNAS/Proxmox/etc) + - Other + + - type: dropdown + id: operating_system + validations: + required: true + attributes: + label: What operating system are you running on? + description: | + Please note we are *unable to provide support for Windows users* unless you are using [Docker on Windows](https://github.com/ArchiveBox/archivebox#:~:text=windows%20without%20docker). + multiple: false + options: + - Linux (Ubuntu/Debian/Arch/Alpine/etc.) + - macOS (including Docker on macOS) + - BSD (FreeBSD/OpenBSD/NetBSD/etc.) + - Windows (including WSL, WSL2, Docker Desktop on Windows) + - Other + + - type: checkboxes + id: filesystem + attributes: + label: What type of drive are you using to store your ArchiveBox data? 
+ description: Are you using a [remote filesystem](https://github.com/ArchiveBox/ArchiveBox/wiki/Setting-Up-Storage#supported-remote-filesystems) or FUSE mount for `data/` OR `data/archive`? + options: + - label: "some of `data/` is on a local SSD or NVMe drive" + required: false + - label: "some of `data/` is on a spinning hard drive or external USB drive" + required: false + - label: "some of `data/` is on a network mount (e.g. NFS/SMB/Ceph/GlusterFS/etc.)" + required: false + - label: "some of `data/` is on a FUSE mount (e.g. SSHFS/RClone/S3/B2/Google Drive/Dropbox/etc.)" + required: false + + + - type: textarea + id: docker_compose_yml + attributes: + label: Docker Compose Configuration + description: "If using Docker Compose, please share your full `docker-compose.yml` file. If using plain Docker, paste the `docker run ...` command you use." + placeholder: | + services: + archivebox: + image: archivebox/archivebox:latest + ports: + - 8000:8000 + volumes: + - ./data:/data + environment: + - ADMIN_USERNAME=admin + - ADMIN_PASSWORD=******** + - ALLOWED_HOSTS=* + - CSRF_TRUSTED_ORIGINS=https://archivebox.example.com + - PUBLIC_INDEX=True + - PUBLIC_SNAPSHOTS=True + - PUBLIC_ADD_VIEW=False + ... + + archivebox_scheduler: + image: archivebox/archivebox:latest + command: schedule --foreground --update --every=day + environment: + ... + + ... + render: shell + validations: + required: false + + - type: textarea + id: configuration + attributes: + label: ArchiveBox Configuration + description: "Please share your full `data/ArchiveBox.conf` file here." + render: shell + placeholder: | + [SERVER_CONFIG] + SECRET_KEY = "*********************" + + WGET_RESTRICT_FILE_NAMES=windows + USE_SYSTEM_WGET=true + CHECK_SSL_VALIDITY=false + ... + validations: + required: false + + + - type: markdown + attributes: + value: | + --- + + We strive to answer issues as quickly as possible, it usually takes us *about a ~week* to respond. 
+ Make sure your `data/` is [**fully backed up**](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#disk-layout) before trying anything suggested here, **we are not responsible for data loss**. + + In the meantime please consider: + + - 💰 [Donating to support ArchiveBox open-source](https://github.com/ArchiveBox/ArchiveBox/wiki/Donations) + - 👨‍✈️ [Hiring us for corporate deployments](https://docs.monadical.com/s/archivebox-consulting-services) with professional support, custom feature development, and help with CAPTCHAs/rate-limits + - 🔍 [Searching the Documentation](https://docs.archivebox.io/) for answers to common questions + - 📚 Reading the [Troubleshooting Guide](https://github.com/ArchiveBox/ArchiveBox/wiki) + - ✨ Testing out a newer [`BETA` release](https://github.com/ArchiveBox/ArchiveBox/releases) (issues are often already fixed in our latest `BETA` releases) + diff --git a/.github/ISSUE_TEMPLATE/2-feature_request.yml b/.github/ISSUE_TEMPLATE/2-feature_request.yml new file mode 100644 index 0000000000..7a30d3b9c9 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/2-feature_request.yml @@ -0,0 +1,128 @@ +name: 💡 Feature or enhancement request +description: Suggest an idea or improvement for this project +title: "Feature Request: ..." +assignees: + - pirate +type: 'Enhancement' +labels: 'status: idea phase' +body: + - type: dropdown + id: suggestion_type + validations: + required: true + attributes: + label: "What type of suggestion are you making?" + multiple: false + options: + - New extractor / type of content to save + - Proposing a new feature + - Modification of existing behavior + - Web UI or UX design improvement + + - type: textarea + id: current_problem + attributes: + label: "What is the problem that your feature request solves?" + description: | + Describe the problem or need that your feature request solves, feel free to include any screenshots or examples. + placeholder: | + e.g. 
I need to be able to archive spanish and french subtitle files from a particular movie site https://example.com/somevideos that's going down soon. + validations: + required: true + + - type: textarea + id: proposed_solution + attributes: + label: "What is your proposed solution?" + description: | + Describe the ideal specific solution you'd want, *and whether it fits into any broader scope of changes*. + placeholder: | + e.g. I specifically need a new archive method to look for multilingual subtitle files related to pages. + The bigger picture solution is the ability for custom user scripts to be run in a puppeteer context during archiving. + validations: + required: true + + - type: textarea + id: workarounds_tried + attributes: + label: "What hacks or alternative solutions have you tried to solve the problem?" + description: | + A description of any alternative approaches, workarounds, or other solutions you've considered to fix the problem. + placeholder: | + e.g. I wait for archivebox to finish archiving the page, then I manually run `yt-dlp --subs ` inside + the `data/archive//` directory to download the subtitle files and add them to the snapshot folder. + validations: + required: true + + - type: textarea + id: version + attributes: + label: Share the entire output of the `archivebox version` command for the current version you are using. + description: | + DO NOT JUST ENTER "the latest version" OR YOUR ISSUE WILL BE CLOSED. + We need to know what version of ArchiveBox and what feature flags you're currently running with in order to contextualize your feature request. + Sometimes we've already fixed the issues in newer BETA versions, sometimes features already exist but may not be available in your specific environment. + + Run the `archivebox version` command inside your current collection dir and paste the *full output* here (*not just the version number*). 
+ For Docker Compose run: `docker compose run archivebox version` + For plain Docker run: `docker run -v $PWD:/data archivebox/archivebox version` + render: shell + placeholder: | + 0.8.6 + ArchiveBox v0.8.6rc0 COMMIT_HASH=721427a BUILD_TIME=2024-10-21 12:57:02 1729515422 + IN_DOCKER=False IN_QEMU=False ARCH=arm64 OS=Darwin PLATFORM=macOS-15.1-arm64-arm-64bit PYTHON=Cpython (venv) + EUID=502:20 UID=502:20 PUID=502:20 FS_UID=502:20 FS_PERMS=644 FS_ATOMIC=True FS_REMOTE=False + DEBUG=False IS_TTY=True SUDO=False ID=dfa11485:aa78ad45 SEARCH_BACKEND=ripgrep LDAP=False + + Binary Dependencies: + √ python 3.14.0 venv_pip ~/.venv/bin/python + √ django 6.0 venv_pip ~/.venv/lib/python3.14/site-packages/django/__init__.py + √ sqlite 2.6.0 venv_pip ~/.venv/lib/python3.14/site-packages/django/db/backends/sqlite3/base.py + √ pip 24.3.1 venv_pip ~/.venv/bin/pip + ... + validations: + required: true + + - type: checkboxes + id: priority + attributes: + label: "How badly do you want this new feature?" + options: + - label: "It's an urgent deal-breaker, I can't live without it" + required: false + - label: "It's important to add it in the near-mid term future" + required: false + - label: "It would be nice to have eventually" + required: false + - label: "I'm willing to [start a PR](https://github.com/ArchiveBox/ArchiveBox#archivebox-development) to develop this myself" + required: false + - label: "I have [donated money](https://github.com/ArchiveBox/ArchiveBox/wiki/Donations) to go towards fixing this issue" + required: false + + - type: checkboxes + id: satisfaction_survey + attributes: + label: Mini Survey + description: How do you like ArchiveBox so far? 
+ options: + - label: "I like ArchiveBox so far / would recommend it to a friend" + required: false + - label: "I've had a lot of difficulty getting ArchiveBox set up" + required: false + - label: "I would pay $10/mo for a hosted version of ArchiveBox if it had this feature" + required: false + + - type: markdown + attributes: + value: | + --- + + We strive to answer issues as quickly as possible, it usually takes us *about a ~week* to respond. + Make sure your `data/` is [**fully backed up**](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#disk-layout) before trying any workarounds or BETAs suggested here, **we are not responsible for data loss**. + + In the meantime please consider: + + - 💰 [Donating to support ArchiveBox open-source](https://github.com/ArchiveBox/ArchiveBox/wiki/Donations) + - 📊 [Hiring us for corporate deployments](https://docs.monadical.com/s/archivebox-consulting-services) with professional support, custom feature development, and help with CAPTCHAs/rate-limits + - 🔍 [Searching the Documentation](https://docs.archivebox.io/) for answers to common questions + - ✨ Testing out a newer [`BETA` release](https://github.com/ArchiveBox/ArchiveBox/releases) (issues are often already fixed in our latest `BETA` releases) diff --git a/.github/ISSUE_TEMPLATE/3-documentation_change.yml b/.github/ISSUE_TEMPLATE/3-documentation_change.yml new file mode 100644 index 0000000000..c711f0897a --- /dev/null +++ b/.github/ISSUE_TEMPLATE/3-documentation_change.yml @@ -0,0 +1,52 @@ +name: 📑 Documentation improvement +description: Submit an idea or correction for the Wiki documentation +title: "Documentation: ..." +labels: 'touches: docs' +type: 'Enhancement' +assignees: + - pirate +body: + - type: markdown + attributes: + value: | + If you prefer, you can submit a [Pull Request](https://github.com/ArchiveBox/docs) on https://github.com/ArchiveBox/docs to edit the docs directly instead. 
+ + - type: input + id: page_url + validations: + required: true + attributes: + label: "What is the URL of the page you'd like to see improved?" + placeholder: e.g. https://github.com/ArchiveBox/docs/wiki/Install + + - type: input + id: section_title + validations: + required: true + attributes: + label: "What is the title of the relevant section?" + placeholder: e.g. Option B. Automatic Setup Script + + - type: textarea + id: suggested_edit + attributes: + label: "What is the suggested edit?" + placeholder: | + e.g. Please document how to run the automatic setup script for ArchiveBox on TempleOS. + Attach images, screenshots, code snippets, etc. anything you think would help. + validations: + required: true + + - type: markdown + attributes: + value: | + --- + + We strive to address issues as quickly as possible, it usually takes us *about a ~week* to respond. + + In the meantime please consider: + + - 💰 [Donating to support ArchiveBox open-source](https://github.com/ArchiveBox/ArchiveBox/wiki/Donations) + - 👨‍✈️ [Hiring us for corporate deployments](https://docs.monadical.com/s/archivebox-consulting-services) with professional support, custom feature development, and help with CAPTCHAs/rate-limits + - 🔍 [Checking out the new ReadTheDocs Documentation](https://docs.archivebox.io/) + - ✨ Helping us test a newer [`BETA` release](https://github.com/ArchiveBox/ArchiveBox/releases) diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md deleted file mode 100644 index 086e3d7b20..0000000000 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ /dev/null @@ -1,46 +0,0 @@ ---- -name: 🐞 Bug report -about: Create a report to help us improve -title: 'Bug: ...' 
-labels: 'bug' -assignees: '' - ---- - - - -#### Describe the bug - - -#### Steps to reproduce - - -#### Screenshots or log output - - - -#### ArchiveBox version - - -```logs -replace this line with the *full*, unshortened output of running `archivebox version` -``` - diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000000..110053ccbc --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,11 @@ +blank_issues_enabled: false +contact_links: + - name: ❓ Ask a question or start a discussion + url: https://github.com/ArchiveBox/ArchiveBox/discussions + about: "Ask a question, get support, or start a design discussion (to report a problem please use '🐞 Bug report' instead)" + - name: 💬 Chat with the dev team & community on Zulip + url: https://zulip.archivebox.io + about: "Join us on our Zulip forum to chat with the developers and other users (it's similar to Discord but self-hosted)." + - name: 💁‍♂️ Hire us for professional support with fast response times + url: https://docs.monadical.com/s/archivebox-consulting-services + about: "We provide hosting, development, and support, including on-prem/cloud w/ SSO & storage, CAPTCHA-solving, proxies, etc." diff --git a/.github/ISSUE_TEMPLATE/documentation_change.md b/.github/ISSUE_TEMPLATE/documentation_change.md deleted file mode 100644 index a02e9374da..0000000000 --- a/.github/ISSUE_TEMPLATE/documentation_change.md +++ /dev/null @@ -1,16 +0,0 @@ ---- -name: 📑 Documentation change -about: Submit a suggestion for the Wiki documentation -title: 'Documentation: Improvement request ...' 
-labels: '' -assignees: '' - ---- - -## Wiki Page URL - - - -## Suggested Edit - - diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md deleted file mode 100644 index 5378139f0c..0000000000 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ /dev/null @@ -1,50 +0,0 @@ ---- -name: 💡 Feature request -about: Suggest an idea for this project -title: 'Feature Request: ...' -labels: 'changes: behavior,status: idea phase' -assignees: '' - ---- - - - -## Type - - - [ ] General question or discussion - - [ ] Propose a brand new feature - - [ ] Request modification of existing behavior or design - -## What is the problem that your feature request solves - - -## Describe the ideal specific solution you'd want, and whether it fits into any broader scope of changes - - -## What hacks or alternative solutions have you tried to solve the problem? - - -## How badly do you want this new feature? - - - [ ] It's an urgent deal-breaker, I can't live without it - - [ ] It's important to add it in the near-mid term future - - [ ] It would be nice to have eventually - ---- - - - [ ] I'm willing to contribute [dev time](https://github.com/ArchiveBox/ArchiveBox#archivebox-development) / [money](https://github.com/sponsors/pirate) to fix this issue - - [ ] I like ArchiveBox so far / would recommend it to a friend - - [ ] I've had a lot of difficulty getting ArchiveBox set up diff --git a/.github/ISSUE_TEMPLATE/question_or_discussion.md b/.github/ISSUE_TEMPLATE/question_or_discussion.md deleted file mode 100644 index 4b7fb02f36..0000000000 --- a/.github/ISSUE_TEMPLATE/question_or_discussion.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -name: 💬 Question, discussion, or support request -about: Start a discussion or ask a question about ArchiveBox -title: 'Question: ...' 
-labels: '' -assignees: '' - ---- - diff --git a/.github/SECURITY.md b/.github/SECURITY.md new file mode 100644 index 0000000000..8fae71e187 --- /dev/null +++ b/.github/SECURITY.md @@ -0,0 +1,34 @@ +# Security Policy + +--- + +## Security Information + +Please see this wiki page for important notices about ArchiveBox security, publishing your archives securely, and the dangers of executing archived JS: + +https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview + +Also see this section of the README about important caveats when running ArchiveBox: + +https://github.com/ArchiveBox/ArchiveBox?tab=readme-ov-file#caveats + +You can also read these pages for more information about ArchiveBox's internals, development environment, DB schema, and more: + +- https://github.com/ArchiveBox/ArchiveBox#archive-layout +- https://github.com/ArchiveBox/ArchiveBox#archivebox-development +- https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives +- https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting + +--- + +## Reporting a Vulnerability + +We use Github's built-in [Private Reporting](https://docs.github.com/en/code-security/security-advisories/guidance-on-reporting-and-writing-information-about-vulnerabilities/privately-reporting-a-security-vulnerability) feature to accept vulnerability reports. + +1. Go to the Security tab on our Github repo: https://github.com/ArchiveBox/ArchiveBox/security + +2. Click the ["Report a Vulnerability"](https://github.com/ArchiveBox/ArchiveBox/security/advisories/new) button + +3. Fill out the form to submit the details of the report and it will be securely sent to the maintainers + +You can also contact the maintainers via our public [Zulip Chat Server zulip.archivebox.io](https://zulip.archivebox.io) or [Twitter DMs @ArchiveBoxApp](https://twitter.com/ArchiveBoxApp). 
diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000000..edc253a66e --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,25 @@ +# To get started with Dependabot version updates, you'll need to specify which +# package ecosystems to update and where the package manifests are located. +# Please see the documentation for all configuration options: +# https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file + +version: 2 +updates: + - package-ecosystem: "pip" + directory: "/" + target-branch: "dev" + schedule: + interval: "monthly" + groups: + pip: + patterns: + - "*" + - package-ecosystem: "npm" + directory: "/" + target-branch: "dev" + schedule: + interval: "monthly" + groups: + npm: + patterns: + - "*" diff --git a/.github/workflows/claude.yml b/.github/workflows/claude.yml new file mode 100644 index 0000000000..2feee2e38c --- /dev/null +++ b/.github/workflows/claude.yml @@ -0,0 +1,49 @@ +name: Claude Code + +on: + issue_comment: + types: [created] + pull_request_review_comment: + types: [created] + issues: + types: [opened, assigned] + pull_request_review: + types: [submitted] + +jobs: + claude: + if: | + (github.event_name == 'issue_comment' && contains(github.event.comment.body, '@claude')) || + (github.event_name == 'pull_request_review_comment' && contains(github.event.comment.body, '@claude')) || + (github.event_name == 'pull_request_review' && contains(github.event.review.body, '@claude')) || + (github.event_name == 'issues' && (contains(github.event.issue.body, '@claude') || contains(github.event.issue.title, '@claude'))) + runs-on: ubuntu-latest + permissions: + contents: read + pull-requests: write + issues: write + id-token: write + actions: read # Required for Claude to read CI results on PRs + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 1 + + - name: Run Claude Code + id: claude + uses: 
anthropics/claude-code-action@v1 + with: + anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }} + + # This is an optional setting that allows Claude to read CI results on PRs + additional_permissions: | + actions: read + + # Optional: Give a custom prompt to Claude. If this is not specified, Claude will perform the instructions specified in the comment that tagged it. + # prompt: 'Update the pull request description to include a summary of changes.' + + # Optional: Add claude_args to customize behavior and configuration + # See https://github.com/anthropics/claude-code-action/blob/main/docs/usage.md + # or https://code.claude.com/docs/en/cli-reference for available options + claude_args: '--allowed-tools Bash(gh pr:*)' diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml deleted file mode 100644 index 66e331b20c..0000000000 --- a/.github/workflows/codeql-analysis.yml +++ /dev/null @@ -1,32 +0,0 @@ -name: "CodeQL" - -on: - push: - branches: [ dev ] - pull_request: - branches: [ dev ] - schedule: - - cron: '43 1 * * 2' - -jobs: - analyze: - name: Analyze - runs-on: ubuntu-latest - - strategy: - fail-fast: false - matrix: - language: [ 'python' ] - - steps: - - name: Checkout repository - uses: actions/checkout@v2 - - - name: Initialize CodeQL - uses: github/codeql-action/init@v1 - with: - languages: ${{ matrix.language }} - queries: security-extended - - - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@v1 diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 0000000000..a6d4e2764f --- /dev/null +++ b/.github/workflows/codeql.yml @@ -0,0 +1,92 @@ +# For most projects, this workflow file will not need changing; you simply need +# to commit it to your repository. +# +# You may wish to alter this file to override the set of languages analyzed, +# or to provide custom queries or build logic. 
+# +# ******** NOTE ******** +# We have attempted to detect the languages in your repository. Please check +# the `language` matrix defined below to confirm you have the correct set of +# supported CodeQL languages. +# +name: "CodeQL" + +on: + push: + branches: [ "dev" ] + pull_request: + branches: [ "dev" ] + schedule: + - cron: '33 17 * * 6' + +jobs: + analyze: + name: Analyze (${{ matrix.language }}) + # Runner size impacts CodeQL analysis time. To learn more, please see: + # - https://gh.io/recommended-hardware-resources-for-running-codeql + # - https://gh.io/supported-runners-and-hardware-resources + # - https://gh.io/using-larger-runners (GitHub.com only) + # Consider using larger runners or machines with greater resources for possible analysis time improvements. + runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }} + timeout-minutes: ${{ (matrix.language == 'swift' && 120) || 360 }} + permissions: + # required for all workflows + security-events: write + + # required to fetch internal or private CodeQL packs + packages: read + + # only required for workflows in private repositories + actions: read + contents: read + + strategy: + fail-fast: false + matrix: + include: + - language: python + build-mode: none + # CodeQL supports the following values keywords for 'language': 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'swift' + # Use `c-cpp` to analyze code written in C, C++ or both + # Use 'java-kotlin' to analyze code written in Java, Kotlin or both + # Use 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both + # To learn more about changing the languages that are analyzed or customizing the build mode for your analysis, + # see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/customizing-your-advanced-setup-for-code-scanning. 
+ # If you are analyzing a compiled language, you can modify the 'build-mode' for that language to customize how + # your codebase is analyzed, see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/codeql-code-scanning-for-compiled-languages + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + # Initializes the CodeQL tools for scanning. + - name: Initialize CodeQL + uses: github/codeql-action/init@v3 + with: + languages: ${{ matrix.language }} + build-mode: ${{ matrix.build-mode }} + # If you wish to specify custom queries, you can do so here or in a config file. + # By default, queries listed here will override any specified in a config file. + # Prefix the list here with "+" to use these queries and those in the config file. + + # For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs + # queries: security-extended,security-and-quality + + # If the analyze step fails for one of the languages you are analyzing with + # "We were unable to automatically build your code", modify the matrix above + # to set the build mode to "manual" for that language. Then modify this step + # to build your code. + # ℹ️ Command-line programs to run using the OS shell. 
+ # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun + - if: matrix.build-mode == 'manual' + run: | + echo 'If you are using a "manual" build mode for one or more of the' \ + 'languages you are analyzing, replace this with the commands to build' \ + 'your code, for example:' + echo ' make bootstrap' + echo ' make release' + exit 1 + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v3 + with: + category: "/language:${{matrix.language}}" diff --git a/.github/workflows/debian.yml b/.github/workflows/debian.yml index 6492f020f9..229589aad9 100644 --- a/.github/workflows/debian.yml +++ b/.github/workflows/debian.yml @@ -2,76 +2,209 @@ name: Build Debian package on: workflow_dispatch: + workflow_call: push: + branches: ['**'] + paths: + - 'pkg/debian/**' + - 'bin/build_deb.sh' + - 'bin/release_deb.sh' + - '.github/workflows/debian.yml' + - 'pyproject.toml' + # release trigger is handled by release.yml to avoid double-runs -env: - DEB_BUILD_OPTIONS: nocheck +permissions: + contents: write jobs: build: - runs-on: ubuntu-20.04 + runs-on: ubuntu-24.04 + strategy: + matrix: + arch: [amd64, arm64] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 with: - submodules: true fetch-depth: 1 - - name: Install packaging dependencies + - name: Get version + id: version + run: echo "version=$(grep '^version = ' pyproject.toml | awk -F'\"' '{print $2}')" >> "$GITHUB_OUTPUT" + + - name: Install nfpm + env: + NFPM_VERSION: "2.45.1" + run: | + curl -fsSL "https://github.com/goreleaser/nfpm/releases/download/v${NFPM_VERSION}/nfpm_${NFPM_VERSION}_Linux_x86_64.tar.gz" \ + | sudo tar -xz -C /usr/local/bin nfpm + nfpm --version + + - name: Build .deb package + run: | + export VERSION="${{ steps.version.outputs.version }}" + export ARCH="${{ matrix.arch }}" + ./bin/build_deb.sh + + - name: Verify .deb package contents + run: | + DEB_FILE="$(ls dist/archivebox*.deb | head -1)" + echo "=== Package 
info ===" + dpkg-deb --info "$DEB_FILE" + echo "" + echo "=== Package contents ===" + dpkg-deb --contents "$DEB_FILE" + echo "" + echo "=== Control fields ===" + dpkg-deb --field "$DEB_FILE" + + - name: Upload .deb artifact + uses: actions/upload-artifact@v4 + with: + name: archivebox-${{ steps.version.outputs.version }}-${{ matrix.arch }}.deb + path: dist/*.deb + + test: + needs: build + strategy: + fail-fast: false + matrix: + include: + - arch: amd64 + runner: ubuntu-24.04 + - arch: arm64 + runner: ubuntu-24.04-arm + runs-on: ${{ matrix.runner }} + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.13" + + - name: Install uv + uses: astral-sh/setup-uv@v4 + with: + enable-cache: true + + - name: Install build dependencies + uses: awalsh128/cache-apt-pkgs-action@acb598e5ddbc6f68a970c5da0688d2f3a9f04d05 # v1.6.0 + with: + packages: build-essential python3-dev python3-setuptools libssl-dev libldap2-dev libsasl2-dev zlib1g-dev libatomic1 + version: 1.0 + + - name: Build local wheel + run: | + uv sync --locked --all-extras --no-install-project --no-install-workspace --no-sources + uv build --wheel --out-dir /tmp/wheels/ + + - name: Download .deb artifact + uses: actions/download-artifact@v4 + with: + pattern: archivebox-*-${{ matrix.arch }}.deb + merge-multiple: true + + - name: Install system dependencies + run: | + sudo apt-get install -y python3.13 python3.13-venv python3-pip git curl wget + + - name: Pre-seed virtualenv with local wheel before dpkg install + run: | + # CI-only: pre-seed the venv with a local wheel so we test the + # unreleased code, not whatever is on PyPI. We explicitly use python3.13 + # here (matching the system dep installed above) to ensure the venv is + # created with the correct Python version. On real installs, install.sh + # handles this by preferring python3.13 and failing if < 3.13. 
+ # When postinstall.sh runs during dpkg -i, it finds the venv already + # populated and just upgrades deps. + sudo mkdir -p /opt/archivebox + sudo python3.13 -m venv /opt/archivebox/venv + sudo /opt/archivebox/venv/bin/python3 -m pip install --quiet --upgrade pip setuptools + sudo /opt/archivebox/venv/bin/pip install --quiet /tmp/wheels/archivebox-*.whl + echo "[√] Pre-seeded /opt/archivebox/venv with local wheel" + + - name: Install .deb package + run: | + # dpkg install will run postinstall which creates the user, + # sets up systemd, and tries pip install (which finds it already installed) + sudo dpkg -i archivebox*.deb || sudo apt-get install -f -y + + - name: Verify archivebox is installed + run: | + which archivebox + archivebox version + + - name: Verify wrapper script works run: | - sudo apt-get update -qq - sudo apt-get install -y \ - python3 python3-dev python3-pip python3-venv python3-all \ - dh-python debhelper devscripts dput software-properties-common \ - python3-distutils python3-setuptools python3-wheel python3-stdeb + /usr/bin/archivebox version + /usr/bin/archivebox --help | head -5 - - name: Build Debian/Apt sdist_dsc + - name: Test archivebox init as archivebox user run: | - rm -Rf deb_dist/* - python3 setup.py --command-packages=stdeb.command sdist_dsc + # The postinstall should have created the user + id archivebox + sudo mkdir -p /tmp/archivebox-test + sudo chown archivebox:archivebox /tmp/archivebox-test + sudo -u archivebox bash -c 'cd /tmp/archivebox-test && /opt/archivebox/venv/bin/archivebox init' - - name: Build Debian/Apt bdist_deb + - name: Test archivebox status run: | - python3 setup.py --command-packages=stdeb.command bdist_deb + sudo -u archivebox bash -c 'cd /tmp/archivebox-test && /opt/archivebox/venv/bin/archivebox status' - - name: Install archivebox from deb + - name: Test archivebox add run: | - cd deb_dist/ - sudo apt-get install ./archivebox*.deb + sudo -u archivebox bash -c 'cd /tmp/archivebox-test && 
/opt/archivebox/venv/bin/archivebox add "https://example.com"' - - name: Check ArchiveBox version + - name: Verify systemd service file exists run: | - # must create dir needed for snaps to run as non-root on github actions - sudo mkdir -p /run/user/1001 && sudo chmod -R 777 /run/user/1001 - mkdir "${{ github.workspace }}/data" && cd "${{ github.workspace }}/data" - archivebox init - archivebox config --set SAVE_READABILITY=False - archivebox config --set SAVE_MERCURY=False - archivebox config --set SAVE_SINGLEFILE=False - archivebox --version - - - name: Add some links to test + test -f /usr/lib/systemd/system/archivebox.service + cat /usr/lib/systemd/system/archivebox.service + + # Upload .deb to GitHub Release only after tests pass + release: + if: github.event_name == 'release' || github.event_name == 'workflow_call' + needs: [build, test] + runs-on: ubuntu-24.04 + permissions: + contents: write + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 1 + + - name: Determine release tag + id: tag + run: | + TAG="${{ github.event.release.tag_name }}" + if [ -z "$TAG" ]; then + TAG="v$(grep '^version = ' pyproject.toml | awk -F'\"' '{print $2}')" + echo "[i] No release tag in event context, using version from pyproject.toml: $TAG" + fi + echo "tag=$TAG" >> "$GITHUB_OUTPUT" + + - name: Download all .deb artifacts + uses: actions/download-artifact@v4 + with: + pattern: archivebox-*.deb + merge-multiple: true + + - name: Upload .deb to GitHub Release + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | - cd "${{ github.workspace }}/data" - archivebox add 'https://example.com' - archivebox status - - # - name: Commit built package - # run: | - # cd deb_dist/ - # git config --local user.email "action@github.com" - # git config --local user.name "GitHub Action" - # git commit -m "Debian package autobuild" -a - - # - name: Push build to Github - # uses: ad-m/github-push-action@master - # with: - # github_token: ${{ secrets.GITHUB_TOKEN }} - # repository: 
ArchiveBox/debian-archivebox - # branch: ${{ github.ref }} - # directory: deb_dist - - # - name: Push build to Launchpad PPA - # run: | - # debsign -k "$PGP_KEY_ID" "deb_dist/archivebox_${VERSION}-${DEBIAN_VERSION}_source.changes" - # dput archivebox "deb_dist/archivebox_${VERSION}-${DEBIAN_VERSION}_source.changes" + TAG="${{ steps.tag.outputs.tag }}" + # Verify the release exists before uploading + if ! gh release view "$TAG" >/dev/null 2>&1; then + echo "[!] No GitHub Release found for tag $TAG." + if [ -n "${{ github.event.release.tag_name }}" ]; then + echo "[X] This was triggered by a release event — the release should exist. Failing." + exit 1 + fi + echo "[i] Skipping upload (workflow_dispatch without a release)." + echo " Create a release first or trigger via the release event." + exit 0 + fi + gh release upload "$TAG" *.deb --clobber diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index ac080b4f1a..ce0da51722 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -2,6 +2,7 @@ name: Build Docker image on: workflow_dispatch: + workflow_call: push: branches: - '**' @@ -12,73 +13,117 @@ on: env: DOCKER_IMAGE: archivebox-ci - jobs: buildx: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v2 - with: - submodules: true - fetch-depth: 1 + uses: actions/checkout@v4 + # with: + # submodules: true + # fetch-depth: 1 - name: Set up QEMU - uses: docker/setup-qemu-action@v1 - + uses: docker/setup-qemu-action@v3 + - name: Set up Docker Buildx id: buildx - uses: docker/setup-buildx-action@v1 + uses: docker/setup-buildx-action@v3 with: version: latest install: true - + platforms: linux/amd64,linux/arm64 + - name: Builder instance name run: echo ${{ steps.buildx.outputs.name }} - + - name: Available platforms run: echo ${{ steps.buildx.outputs.platforms }} - + - name: Cache Docker layers - uses: actions/cache@v2 + uses: actions/cache@v4 with: path: /tmp/.buildx-cache key: ${{ runner.os }}-buildx-${{ 
github.sha }} restore-keys: | ${{ runner.os }}-buildx- - - name: Docker Login - uses: docker/login-action@v1 + - name: Login to Docker Hub + uses: docker/login-action@v3 if: github.event_name != 'pull_request' with: - username: ${{ secrets.DOCKER_USERNAME }} - password: ${{ secrets.DOCKER_PASSWORD }} - - - name: Collect Docker tags + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_PASSWORD }} + + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Collect Full Release Docker tags + # https://github.com/docker/metadata-action id: docker_meta - uses: crazy-max/ghaction-docker-meta@v2 + uses: docker/metadata-action@v5 + if: github.event_name == 'workflow_dispatch' || github.event_name == 'workflow_call' with: - images: archivebox/archivebox,nikisweeting/archivebox - flavor: | - latest=auto + images: archivebox/archivebox,ghcr.io/archivebox/archivebox tags: | + # :stable type=ref,event=branch + # :0.7.3 type=semver,pattern={{version}} + # :0.7 type=semver,pattern={{major}}.{{minor}} + # :sha-463ea54 type=sha - + # :latest + type=raw,value=latest,enable=${{ github.ref == format('refs/heads/{0}', 'stable') }} + + - name: Collect Non-Release Docker tags + # https://github.com/docker/metadata-action + id: docker_meta_non_release + uses: docker/metadata-action@v5 + if: github.event_name != 'workflow_dispatch' && github.event_name != 'workflow_call' + with: + images: archivebox/archivebox,ghcr.io/archivebox/archivebox + tags: | + # :stable + type=ref,event=branch + # :sha-463ea54 + type=sha + - name: Build and push id: docker_build - uses: docker/build-push-action@v2 + uses: docker/build-push-action@v5 with: context: ./ file: ./Dockerfile builder: ${{ steps.buildx.outputs.name }} push: ${{ github.event_name != 'pull_request' }} - tags: ${{ steps.docker_meta.outputs.tags }} + tags: ${{ (github.event_name == 
'workflow_dispatch' || github.event_name == 'workflow_call') && steps.docker_meta.outputs.tags || steps.docker_meta_non_release.outputs.tags }} + labels: ${{ (github.event_name == 'workflow_dispatch' || github.event_name == 'workflow_call') && steps.docker_meta.outputs.labels || steps.docker_meta_non_release.outputs.labels }} cache-from: type=local,src=/tmp/.buildx-cache - cache-to: type=local,dest=/tmp/.buildx-cache - platforms: linux/amd64,linux/arm64,linux/arm/v7 + cache-to: type=local,dest=/tmp/.buildx-cache-new + platforms: linux/amd64,linux/arm64 - name: Image digest run: echo ${{ steps.docker_build.outputs.digest }} + + - name: Update README + uses: peter-evans/dockerhub-description@v4 + with: + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_PASSWORD }} + repository: archivebox/archivebox + + # This ugly bit is necessary if you don't want your cache to grow forever + # until it hits GitHub's limit of 5GB. + # Temp fix + # https://github.com/docker/build-push-action/issues/252 + # https://github.com/moby/buildkit/issues/1896 + - name: Move cache + run: | + rm -rf /tmp/.buildx-cache + mv /tmp/.buildx-cache-new /tmp/.buildx-cache diff --git a/.github/workflows/duplicate-issue-detection.yml b/.github/workflows/duplicate-issue-detection.yml new file mode 100644 index 0000000000..98dcd8394a --- /dev/null +++ b/.github/workflows/duplicate-issue-detection.yml @@ -0,0 +1,59 @@ +name: Duplicate Issue Detection + +on: + issues: + types: [opened] + +jobs: + check-duplicates: + runs-on: ubuntu-latest + permissions: + contents: read + issues: write + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 1 + + - name: Install opencode + run: curl -fsSL https://opencode.ai/install | bash + + - name: Check for duplicate issues + env: + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + OPENCODE_PERMISSION: | + { + "bash": { + "gh issue*": "allow", + "gh pr*": "allow", + 
"*": "deny" + }, + "webfetch": "allow" + } + run: | + opencode run -m anthropic/claude-haiku-4-5 "A new issue has been created: + + Issue number: + ${{ github.event.issue.number }} + + Lookup this issue and search through existing issues and PRs (excluding #${{ github.event.issue.number }}) in this repository to find any potential duplicates of this new issue. + Consider: + 1. Similar titles or descriptions + 2. Same error messages or symptoms + 3. Related functionality or components + 4. Similar feature requests + + If you find any potential duplicates, please comment on the new issue with: + - A brief explanation of why it might be a duplicate + - Links to the potentially duplicate issues or PRs + - A suggestion to check those issues first + + Use this format for the comment: + 'This issue might be a duplicate of existing issues. Please check: + - #[issue_number]: [brief description of similarity] + + Feel free to ignore if none of these address your specific case.' + + If no clear duplicates are found, do not comment." diff --git a/.github/workflows/gh-pages.yml b/.github/workflows/gh-pages.yml new file mode 100644 index 0000000000..751f71aa42 --- /dev/null +++ b/.github/workflows/gh-pages.yml @@ -0,0 +1,64 @@ +# Simple workflow for deploying static content to GitHub Pages +name: Deploy static content to Pages + +on: + # Runs on pushes targeting the default branch + push: + branches: ["dev"] + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages +permissions: + contents: read + pages: write + id-token: write + +# Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. +# However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. 
+concurrency: + group: "pages" + cancel-in-progress: false + +jobs: + # Single deploy job since we're just deploying + deploy: + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + runs-on: ubuntu-latest + steps: + + - name: Checkout + uses: actions/checkout@v4 + with: + submodules: true + fetch-depth: 1 + - name: Copy README.md into place + run: | + rm -f ./website/README.md + cp ./README.md ./website/README.md + - name: Setup Pages + uses: actions/configure-pages@v5 + - name: Build with Jekyll + uses: actions/jekyll-build-pages@v1 + with: + source: ./website + destination: ./_site + - name: Upload artifact + uses: actions/upload-pages-artifact@v3 + + # - name: Checkout + # uses: actions/checkout@v4 + # - name: Setup Pages + # uses: actions/configure-pages@v5 + # - name: Upload artifact + # uses: actions/upload-pages-artifact@v3 + # with: + # # Upload entire repository + # path: './website' + + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v4 diff --git a/.github/workflows/homebrew.yml b/.github/workflows/homebrew.yml index d9bb05f1a7..95432efdbf 100644 --- a/.github/workflows/homebrew.yml +++ b/.github/workflows/homebrew.yml @@ -1,50 +1,229 @@ -name: Build Homebrew package +name: Build Homebrew formula on: workflow_dispatch: + workflow_call: push: + branches: ['**'] + paths: + - 'brew_dist/**' + - 'bin/build_brew.sh' + - 'bin/release_brew.sh' + - '.github/workflows/homebrew.yml' + - 'pyproject.toml' + # release trigger is handled by release.yml to avoid double-runs +permissions: + contents: read jobs: - build: - runs-on: macos-latest + build-and-test: + strategy: + fail-fast: false + matrix: + os: [macos-latest, ubuntu-24.04] + runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 with: - submodules: true fetch-depth: 1 - # TODO: modify archivebox.rb to update src url, hashes, and dependencies + - name: Get version + id: version + run: echo "version=$(grep 
'^version = ' pyproject.toml | awk -F'\"' '{print $2}')" >> "$GITHUB_OUTPUT" + + - name: Validate formula template syntax + run: ruby -c brew_dist/archivebox.rb + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.13" + + - name: Install uv + uses: astral-sh/setup-uv@v4 + with: + enable-cache: true + + - name: Install build dependencies (Linux) + if: runner.os == 'Linux' + uses: awalsh128/cache-apt-pkgs-action@acb598e5ddbc6f68a970c5da0688d2f3a9f04d05 # v1.6.0 + with: + packages: build-essential python3-dev python3-setuptools libssl-dev libldap2-dev libsasl2-dev zlib1g-dev libatomic1 + version: 1.0 + + - name: Build local sdist + run: | + uv sync --locked --all-extras --no-install-project --no-install-workspace --no-sources + uv build --sdist --out-dir /tmp/sdist/ + + - name: Generate formula from local sdist + run: | + VERSION="${{ steps.version.outputs.version }}" + SDIST_PATH="$(ls /tmp/sdist/archivebox-*.tar.gz | head -1)" + SDIST_SHA256="$(shasum -a 256 "$SDIST_PATH" | awk '{print $1}')" + + # Install archivebox + poet into a temp venv to generate resource stanzas + python3 -m venv /tmp/poet-venv + source /tmp/poet-venv/bin/activate + pip install --quiet "$SDIST_PATH" homebrew-pypi-poet + echo "[+] Generating resource stanzas with homebrew-pypi-poet..." 
+ RESOURCES="$(poet archivebox)" + deactivate + + # For CI: use file:// URL pointing to local sdist + # For release: this gets overridden with the PyPI URL + SDIST_URL="file://${SDIST_PATH}" + + cat > /tmp/archivebox.rb << RUBY +class Archivebox < Formula + include Language::Python::Virtualenv + + desc "Self-hosted internet archiving solution" + homepage "https://github.com/ArchiveBox/ArchiveBox" + url "${SDIST_URL}" + sha256 "${SDIST_SHA256}" + license "MIT" + + depends_on "python@3.13" + +${RESOURCES} + + def install + virtualenv_install_with_resources + end + + def post_install + data_dir = var/"archivebox" + data_dir.mkpath + ENV["DATA_DIR"] = data_dir.to_s + system bin/"archivebox", "init" + end + + def caveats + <<~EOS + ArchiveBox data is stored in: + #{var}/archivebox + + To start archiving, run: + cd #{var}/archivebox && archivebox add 'https://example.com' + + To start the web UI: + cd #{var}/archivebox && archivebox server 0.0.0.0:8000 + EOS + end + + test do + assert_match version.to_s, shell_output("#{bin}/archivebox version") + end +end +RUBY + + echo "[√] Generated formula:" + ruby -c /tmp/archivebox.rb + cat /tmp/archivebox.rb + + - name: Install Homebrew (Linux only) + if: runner.os == 'Linux' + run: | + NONINTERACTIVE=1 /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)" + eval "$(/home/linuxbrew/.linuxbrew/bin/brew shellenv)" + echo "HOMEBREW_PREFIX=$HOMEBREW_PREFIX" >> "$GITHUB_ENV" + echo "HOMEBREW_CELLAR=$HOMEBREW_CELLAR" >> "$GITHUB_ENV" + echo "HOMEBREW_REPOSITORY=$HOMEBREW_REPOSITORY" >> "$GITHUB_ENV" + echo "$HOMEBREW_PREFIX/bin" >> "$GITHUB_PATH" + echo "$HOMEBREW_PREFIX/sbin" >> "$GITHUB_PATH" + + - name: Install brew dependencies + run: brew install python@3.13 - - name: Build Homebrew Bottle + - name: Install archivebox via brew from local formula run: | - pip3 install --upgrade pip setuptools wheel - cd brew_dist/ - brew install --build-bottle ./archivebox.rb - # brew bottle archivebox + 
brew install --build-from-source --verbose /tmp/archivebox.rb - - name: Add some links to test + - name: Verify archivebox version + run: archivebox version + + - name: Test archivebox init run: | - mkdir data && cd data + mkdir -p /tmp/archivebox-test && cd /tmp/archivebox-test archivebox init - archivebox add 'https://example.com' - archivebox version + + - name: Test archivebox status + run: | + cd /tmp/archivebox-test archivebox status - # - name: Commit built package - # run: | - # cd brew_dist/ - # git config --local user.email "action@github.com" - # git config --local user.name "GitHub Action" - # git commit -m "Homebrew package autobuild" -a - - # - name: Push build to Github - # uses: ad-m/github-push-action@master - # with: - # github_token: ${{ secrets.GITHUB_TOKEN }} - # repository: ArchiveBox/homebrew-archivebox - # branch: ${{ github.ref }} - # directory: brew_dist - - # TODO: push bottle homebrew core PR with latest changes + # On release only: generate the real formula with PyPI URL and push to tap + release: + if: github.event_name == 'release' || github.event_name == 'workflow_call' + needs: build-and-test + runs-on: macos-latest + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 1 + + - name: Get version + id: version + run: echo "version=$(grep '^version = ' pyproject.toml | awk -F'\"' '{print $2}')" >> "$GITHUB_OUTPUT" + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.13" + + - name: Wait for PyPI package availability + run: | + VERSION="${{ steps.version.outputs.version }}" + echo "[+] Waiting for archivebox==${VERSION} to be available on PyPI..." + for i in $(seq 1 30); do + if pip index versions archivebox 2>/dev/null | grep -q "$VERSION"; then + echo "[√] archivebox==${VERSION} is available on PyPI" + break + fi + if [ "$i" -eq 30 ]; then + echo "[!] Timed out waiting for PyPI. Trying to install anyway..." + break + fi + echo " Attempt $i/30 - not yet available, waiting 30s..." 
+ sleep 30 + done + + - name: Generate release formula via build_brew.sh + run: ./bin/build_brew.sh + + - name: Test formula install + run: | + brew install --build-from-source brew_dist/archivebox.rb + archivebox version + + - name: Upload formula artifact + uses: actions/upload-artifact@v4 + with: + name: archivebox.rb + path: brew_dist/archivebox.rb + + - name: Push to homebrew-archivebox tap + env: + GH_TOKEN: ${{ secrets.HOMEBREW_TAP_TOKEN }} + run: | + VERSION="${{ steps.version.outputs.version }}" + git clone "https://x-access-token:${GH_TOKEN}@github.com/ArchiveBox/homebrew-archivebox.git" /tmp/tap + + cp brew_dist/archivebox.rb /tmp/tap/Formula/archivebox.rb 2>/dev/null || \ + cp brew_dist/archivebox.rb /tmp/tap/archivebox.rb + + cd /tmp/tap + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + git add -A + if git diff --cached --quiet; then + echo "[i] No changes to formula, skipping push." + else + git commit -m "Update archivebox to v${VERSION}" + git push origin HEAD + echo "[√] Formula pushed to homebrew-archivebox tap" + fi diff --git a/.github/workflows/jekyll-gh-pages.yml b/.github/workflows/jekyll-gh-pages.yml new file mode 100644 index 0000000000..0c6fb5681f --- /dev/null +++ b/.github/workflows/jekyll-gh-pages.yml @@ -0,0 +1,58 @@ +# Sample workflow for building and deploying a Jekyll site to GitHub Pages +name: Build GitHub Pages website + +on: + # Runs on pushes targeting the default branch + push: + branches: ["dev"] + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages +permissions: + contents: read + pages: write + id-token: write + +# Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. +# However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. 
+concurrency: + group: "pages" + cancel-in-progress: true + +jobs: + # Build job + build: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + submodules: true + fetch-depth: 1 + - name: Copy README.md into place + run: | + rm ./website/README.md + cp ./README.md ./website/README.md + - name: Setup Pages + uses: actions/configure-pages@v5 + - name: Build with Jekyll + uses: actions/jekyll-build-pages@v1 + with: + source: ./website + destination: ./_site + - name: Upload artifact + uses: actions/upload-pages-artifact@v3 + + # Deployment job + deploy: + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + runs-on: ubuntu-latest + needs: build + steps: + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v4 diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 80f4f19f13..cf756de056 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -5,30 +5,42 @@ on: push: env: - MAX_LINE_LENGTH: 110 + UV_NO_SOURCES: "1" + PYTHONPATH: ${{ github.workspace }}/abx-pkg:${{ github.workspace }}/abx-plugins:${{ github.workspace }}/abx-dl jobs: lint: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 with: submodules: true fetch-depth: 1 + - name: Clone abx-pkg + run: git clone --depth=1 https://github.com/ArchiveBox/abx-pkg.git abx-pkg + + - name: Clone abx-plugins + run: git clone --depth=1 https://github.com/ArchiveBox/abx-plugins.git abx-plugins + + - name: Clone abx-dl + run: git clone --depth=1 https://github.com/ArchiveBox/abx-dl.git abx-dl + - name: Set up Python - uses: actions/setup-python@v1 + uses: actions/setup-python@v5 with: - python-version: 3.9 + python-version: "3.13" architecture: x64 - - name: Install flake8 - run: | - pip install flake8 + - name: Install uv + uses: astral-sh/setup-uv@v6 + with: + enable-cache: true - - name: Lint with flake8 + - name: Install 
dependencies with uv run: | - # one pass for show-stopper syntax errors or undefined names - flake8 archivebox --count --show-source --statistics - # one pass for small stylistic things - flake8 archivebox --count --max-line-length="$MAX_LINE_LENGTH" --statistics + uv sync --all-extras --all-groups --no-sources --no-cache + uv pip install -e ./abx-pkg -e ./abx-plugins[dev] -e ./abx-dl + + - name: Run prek + run: uv run --no-sync --no-sources prek run --all-files diff --git a/.github/workflows/pip.yml b/.github/workflows/pip.yml old mode 100644 new mode 100755 index 7c2d341d22..dce9cb73e2 --- a/.github/workflows/pip.yml +++ b/.github/workflows/pip.yml @@ -2,61 +2,64 @@ name: Build Pip package on: workflow_dispatch: + workflow_call: push: + branches: + - '**' + tags: + - 'v*' +env: + PYTHON_VERSION: "3.13" jobs: build: - runs-on: ubuntu-20.04 + permissions: + id-token: write + runs-on: ubuntu-24.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v4 with: - submodules: true - fetch-depth: 1 + enable-cache: true - name: Set up Python - uses: actions/setup-python@v1 + uses: actions/setup-python@v5 with: - python-version: 3.9 + python-version: ${{ env.PYTHON_VERSION }} architecture: x64 - - name: Build Python Package - run: | - pip3 install --upgrade pip setuptools wheel - rm -Rf pip_dist/*.whl - python3 setup.py \ - sdist --dist-dir=./pip_dist \ - bdist_wheel --dist-dir=./pip_dist \ - egg_info --egg-base=./pip_dist - pip install pip_dist/archivebox-*.whl - - - name: Add some links to test + - name: APT install archivebox dev + run dependencies + uses: awalsh128/cache-apt-pkgs-action@latest + with: + packages: ripgrep build-essential python3-dev python3-setuptools libssl-dev libldap2-dev libsasl2-dev zlib1g-dev libatomic1 gnupg2 curl wget python3-ldap python3-msgpack python3-mutagen python3-regex python3-pycryptodome procps + version: 1.0 + + - name: UV install archivebox dev + run sub-dependencies + 
run: uv sync --locked --all-extras --no-install-project --no-install-workspace --no-sources + + - name: UV build archivebox and archivebox/pkgs/* packages run: | - mkdir data && cd data - archivebox init - archivebox add 'https://example.com' - archivebox version - archivebox status - - # - name: Push build to PyPI - # run: | - # cd pip_dist/ - # python3 -m twine upload --repository testpypi pip_dist/*.{whl,tar.gz} - # python3 -m twine upload --repository pypi pip_dist/*.{whl,tar.gz} - - # - name: Commit built package - # run: | - # cd pip_dist/ - # git config --local user.email "action@github.com" - # git config --local user.name "GitHub Action" - # git commit -m "Pip package autobuild" -a - - # - name: Push build to Github - # uses: ad-m/github-push-action@master - # with: - # github_token: ${{ secrets.GITHUB_TOKEN }} - # repository: ArchiveBox/pip-archivebox - # branch: ${{ github.ref }} - # directory: pip_dist + uv build --all + - name: Publish new package wheels and sdists to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + # ignore when publish to PyPI fails due to duplicate tag + continue-on-error: true + with: + password: ${{ secrets.PYPI_PAT_SECRET }} + + - name: UV install archivebox and archivebox/pkgs/* locally for tests + run: uv sync --locked --all-extras --no-sources + + - name: UV run archivebox init + archivebox version + run: | + mkdir -p data && cd data + uv run --no-sync --no-sources archivebox init \ + && uv run --no-sync --no-sources archivebox version + # && uv run archivebox add 'https://example.com' \ + # && uv run archivebox status \ + # || (echo "UV Failed to run archivebox!" 
&& exit 1) diff --git a/.github/workflows/release-runner.yml b/.github/workflows/release-runner.yml new file mode 100644 index 0000000000..e9dd3ac444 --- /dev/null +++ b/.github/workflows/release-runner.yml @@ -0,0 +1,45 @@ +name: Release State + +on: + push: + branches: + - '**' + workflow_dispatch: + +permissions: + contents: write + id-token: write + +jobs: + release-state: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + submodules: true + ref: ${{ github.ref_name }} + + - uses: actions/setup-python@v5 + with: + python-version: "3.13" + + - uses: astral-sh/setup-uv@v6 + with: + enable-cache: true + + - uses: actions/setup-node@v4 + with: + node-version: 22 + + - name: Configure git identity + run: | + git config user.name "github-actions[bot]" + git config user.email "41898282+github-actions[bot]@users.noreply.github.com" + + - name: Run release script + env: + DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} + GH_TOKEN: ${{ github.token }} + PYPI_PAT_SECRET: ${{ secrets.PYPI_PAT_SECRET }} + run: ./bin/release.sh diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000000..032127aeb7 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,42 @@ +name: Release + +# Orchestrates the full release pipeline: +# 1. Build and publish pip package to PyPI +# 2. Build .deb packages and Homebrew formula (in parallel, after pip) +# 3. Build Docker images (in parallel with deb/brew) +# +# Individual workflows also run on push for CI (see their own triggers). +# This workflow ensures the correct ordering during a release. 
+ +on: + release: + types: [published] + +permissions: + contents: write + packages: write + id-token: write + +jobs: + pip: + name: Publish to PyPI + uses: ./.github/workflows/pip.yml + secrets: inherit + + debian: + name: Build .deb packages + needs: pip + uses: ./.github/workflows/debian.yml + secrets: inherit + + homebrew: + name: Update Homebrew formula + needs: pip + uses: ./.github/workflows/homebrew.yml + secrets: inherit + + docker: + name: Build Docker images + needs: pip + uses: ./.github/workflows/docker.yml + secrets: inherit diff --git a/.github/workflows/test-parallel.yml b/.github/workflows/test-parallel.yml new file mode 100644 index 0000000000..adc03a6fd5 --- /dev/null +++ b/.github/workflows/test-parallel.yml @@ -0,0 +1,183 @@ +name: Parallel Tests + +on: + pull_request: + branches: [dev, main, master] + push: + branches: [dev] + +env: + PYTHONIOENCODING: utf-8 + PYTHONLEGACYWINDOWSSTDIO: utf-8 + USE_COLOR: False + UV_NO_SOURCES: "1" + +jobs: + discover-tests: + name: Discover test files + runs-on: ubuntu-22.04 + outputs: + test-files: ${{ steps.set-matrix.outputs.test-files }} + + steps: + - uses: actions/checkout@v4 + with: + submodules: true + fetch-depth: 1 + + - name: Discover test files + id: set-matrix + run: | + # Find all main test files + all_tests=$(find archivebox/tests -maxdepth 1 -name "test_*.py" -type f | sort) + + # Create JSON array with test file info + json_array="[" + first=true + for test_file in $all_tests; do + if [ "$first" = true ]; then + first=false + else + json_array+="," + fi + + # Extract a display name for the test + name="main/$(basename $test_file .py | sed 's/^test_//')" + + json_array+="{\"path\":\"$test_file\",\"name\":\"$name\"}" + done + json_array+="]" + + echo "test-files=$json_array" >> $GITHUB_OUTPUT + echo "Found $(echo $all_tests | wc -w) test files" + echo "$json_array" | jq '.' 
+ + run-tests: + name: ${{ matrix.test.name }} + runs-on: ubuntu-22.04 + needs: discover-tests + env: + PYTHONPATH: ${{ github.workspace }}/abx-pkg:${{ github.workspace }}/abx-plugins:${{ github.workspace }}/abx-dl + + strategy: + fail-fast: false + matrix: + test: ${{ fromJson(needs.discover-tests.outputs.test-files) }} + python: ["3.13"] + + steps: + - uses: actions/checkout@v4 + with: + submodules: true + fetch-depth: 1 + + - name: Clone abx-pkg + run: git clone --depth=1 https://github.com/ArchiveBox/abx-pkg.git abx-pkg + + - name: Clone abx-plugins + run: git clone --depth=1 https://github.com/ArchiveBox/abx-plugins.git abx-plugins + + - name: Clone abx-dl + run: git clone --depth=1 https://github.com/ArchiveBox/abx-dl.git abx-dl + + - name: Set up Python ${{ matrix.python }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python }} + architecture: x64 + + - name: Install uv + uses: astral-sh/setup-uv@v4 + with: + version: "latest" + + - name: Set up Node JS + uses: actions/setup-node@v4 + with: + node-version: 22 + + - name: Cache uv + uses: actions/cache@v3 + with: + path: ~/.cache/uv + key: ${{ runner.os }}-${{ matrix.python }}-uv-${{ hashFiles('pyproject.toml', 'uv.lock') }} + restore-keys: | + ${{ runner.os }}-${{ matrix.python }}-uv- + + - uses: awalsh128/cache-apt-pkgs-action@latest + with: + packages: git ripgrep build-essential python3-dev python3-setuptools libssl-dev libldap2-dev libsasl2-dev zlib1g-dev libatomic1 python3-minimal gnupg2 curl wget python3-ldap python3-msgpack python3-mutagen python3-regex python3-pycryptodome procps + version: 1.1 + + - name: Install dependencies with uv + run: | + uv sync --dev --all-extras --no-sources + uv pip install -e ./abx-pkg -e ./abx-plugins[dev] -e ./abx-dl + + - name: Run test - ${{ matrix.test.name }} + run: | + mkdir -p tests/out + uv run --no-sync --no-sources pytest -xvs "${{ matrix.test.path }}" --basetemp=tests/out --ignore=archivebox/pkgs + + plugin-tests: + name: Plugin tests + 
runs-on: ubuntu-22.04 + env: + PYTHONPATH: ${{ github.workspace }}/abx-pkg:${{ github.workspace }}/abx-plugins:${{ github.workspace }}/abx-dl + + steps: + - uses: actions/checkout@v4 + with: + submodules: true + fetch-depth: 1 + + - name: Clone abx-pkg + run: git clone --depth=1 https://github.com/ArchiveBox/abx-pkg.git abx-pkg + + - name: Clone abx-plugins + run: git clone --depth=1 https://github.com/ArchiveBox/abx-plugins.git abx-plugins + + - name: Clone abx-dl + run: git clone --depth=1 https://github.com/ArchiveBox/abx-dl.git abx-dl + + - name: Set up Python 3.13 + uses: actions/setup-python@v4 + with: + python-version: "3.13" + architecture: x64 + + - name: Install uv + uses: astral-sh/setup-uv@v4 + with: + version: "latest" + + - name: Set up Node JS + uses: actions/setup-node@v4 + with: + node-version: 22 + + - name: Cache uv + uses: actions/cache@v3 + with: + path: ~/.cache/uv + key: ${{ runner.os }}-3.13-uv-${{ hashFiles('pyproject.toml', 'uv.lock') }} + restore-keys: | + ${{ runner.os }}-3.13-uv- + + - uses: awalsh128/cache-apt-pkgs-action@latest + with: + packages: git ripgrep build-essential python3-dev python3-setuptools libssl-dev libldap2-dev libsasl2-dev zlib1g-dev libatomic1 python3-minimal gnupg2 curl wget python3-ldap python3-msgpack python3-mutagen python3-regex python3-pycryptodome procps + version: 1.1 + + - name: Install dependencies with uv + run: | + uv sync --dev --all-extras --no-sources + uv pip install -e ./abx-pkg -e ./abx-plugins[dev] -e ./abx-dl + + - name: Run plugin tests + env: + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + TWOCAPTCHA_API_KEY: ${{ secrets.TWOCAPTCHA_API_KEY }} + API_KEY_2CAPTCHA: ${{ secrets.TWOCAPTCHA_API_KEY }} + run: | + uv run --no-sync --no-sources bash ./bin/test_plugins.sh --no-coverage diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml old mode 100644 new mode 100755 index 50680030f3..bd77bc4728 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -6,75 +6,70 @@ 
env: PYTHONIOENCODING: utf-8 PYTHONLEGACYWINDOWSSTDIO: utf-8 USE_COLOR: False + UV_NO_SOURCES: "1" jobs: python_tests: runs-on: ${{ matrix.os }} + env: + PYTHONPATH: ${{ github.workspace }}/abx-pkg:${{ github.workspace }}/abx-plugins:${{ github.workspace }}/abx-dl strategy: matrix: - os: [ubuntu-20.04, macos-latest, windows-latest] - python: [3.7] + os: [ubuntu-22.04] + # os: [ubuntu-22.04, macos-latest, windows-latest] + python: ["3.13"] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 with: submodules: true fetch-depth: 1 + - name: Clone abx-pkg + run: git clone --depth=1 https://github.com/ArchiveBox/abx-pkg.git abx-pkg + + - name: Clone abx-plugins + run: git clone --depth=1 https://github.com/ArchiveBox/abx-plugins.git abx-plugins + + - name: Clone abx-dl + run: git clone --depth=1 https://github.com/ArchiveBox/abx-dl.git abx-dl + ### Setup Python & JS Languages - name: Set up Python ${{ matrix.python }} - uses: actions/setup-python@v1 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python }} architecture: x64 - - name: Set up Node JS 14.7.0 - uses: actions/setup-node@v1 + - name: Install uv + uses: astral-sh/setup-uv@v4 with: - node-version: 14.7.0 + version: "latest" - ### Install Python & JS Dependencies - - name: Get pip cache dir - id: pip-cache - run: | - echo "::set-output name=dir::$(pip cache dir)" + - name: Set up Node JS + uses: actions/setup-node@v4 + with: + node-version: 22 - - name: Cache pip - uses: actions/cache@v2 - id: cache-pip + ### Install Python & JS Dependencies + - name: Cache uv + uses: actions/cache@v3 with: - path: ${{ steps.pip-cache.outputs.dir }} - key: ${{ runner.os }}-${{ matrix.python }}-venv-${{ hashFiles('setup.py') }} + path: ~/.cache/uv + key: ${{ runner.os }}-${{ matrix.python }}-uv-${{ hashFiles('pyproject.toml', 'uv.lock') }} restore-keys: | - ${{ runner.os }}-${{ matrix.python }}-venv- + ${{ runner.os }}-${{ matrix.python }}-uv- - - name: Install pip dependencies - run: | - python -m 
pip install --upgrade pip setuptools wheel pytest bottle - ./bin/build_pip.sh - python -m pip install . - - - name: Get npm cache dir - id: npm-cache - run: | - echo "::set-output name=dir::$GITHUB_WORKSPACE/node_modules" - - - name: Cache npm - uses: actions/cache@v2 - id: cache-npm + - uses: awalsh128/cache-apt-pkgs-action@latest with: - path: ${{ steps.npm-cache.outputs.dir }} - key: ${{ runner.os }}-node_modules-${{ hashFiles('package-lock.json') }} - restore-keys: | - ${{ runner.os }}-node_modules + packages: ripgrep build-essential python3-dev python3-setuptools libssl-dev libldap2-dev libsasl2-dev zlib1g-dev libatomic1 python3-minimal gnupg2 curl wget python3-ldap python3-msgpack python3-mutagen python3-regex python3-pycryptodome procps + version: 1.0 - - name: Install npm requirements + - name: Install dependencies with uv run: | - npm install - echo "SINGLEFILE_BINARY=$GITHUB_WORKSPACE/node_modules/.bin/single-file" >> $GITHUB_ENV - echo "READABILITY_BINARY=$GITHUB_WORKSPACE/node_modules/.bin/readability-extractor" >> $GITHUB_ENV - echo "MERCURY_BINARY=$GITHUB_WORKSPACE/node_modules/.bin/mercury-parser" >> $GITHUB_ENV + uv sync --dev --all-extras --no-sources + uv pip install -e ./abx-pkg -e ./abx-plugins[dev] -e ./abx-dl ### Run the tests - name: Directory listing for debugging @@ -84,19 +79,30 @@ jobs: - name: Archivebox version run: | - archivebox version + mkdir -p tests/out/data + DATA_DIR="$PWD/tests/out/data" uv run --no-sync --no-sources archivebox version - name: Test built package with pytest # TODO: remove this exception for windows once we get tests passing on that platform if: ${{ !contains(matrix.os, 'windows') }} run: | - python -m pytest -s --basetemp=tests/out --ignore=archivebox/vendor --ignore=deb_dist --ignore=pip_dist --ignore=brew_dist + mkdir -p tests/out + uv run --no-sync --no-sources pytest -s archivebox/tests --basetemp=tests/out --ignore=archivebox/pkgs + + - name: Run plugin tests + if: ${{ !contains(matrix.os, 'windows') }} + 
env: + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + TWOCAPTCHA_API_KEY: ${{ secrets.TWOCAPTCHA_API_KEY }} + API_KEY_2CAPTCHA: ${{ secrets.TWOCAPTCHA_API_KEY }} + run: | + uv run --no-sync --no-sources bash ./bin/test_plugins.sh --no-coverage docker_tests: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 with: submodules: true fetch-depth: 1 diff --git a/.gitignore b/.gitignore index a80c30ba80..8d4be0cc6a 100644 --- a/.gitignore +++ b/.gitignore @@ -6,21 +6,54 @@ __pycache__/ .eggs/ tests/out/ +# Coverage +.coverage +.coverage.* +coverage.json +coverage/ +htmlcov/ + # Python and Node dependencies venv/ .venv/ .docker-venv/ node_modules/ +typings/ + +# Ignore dev lockfiles (should always be built fresh) +pdm.dev.lock +requirements-dev.txt # Packaging artifacts +requirements.txt +.pdm-python +.pdm-build archivebox.egg-info archivebox-*.tar.gz build/ dist/ # Data folders +lib/ +out/ +tmp/ data/ -data1/ -data2/ -data3/ +data*/ +archivebox/tests/data/ +archive/ output/ +logs/ +index.sqlite3 +queue.sqlite3 +*.sqlite* +data.* +.archivebox_id +ArchiveBox.conf +*.stdout +*.stderr +*.log +.tmp/ + +# vim +*.sw? 
+.vscode diff --git a/.gitmodules b/.gitmodules index 196c9a926f..e260fdf58b 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,28 +1,3 @@ [submodule "docs"] path = docs url = https://github.com/ArchiveBox/ArchiveBox.wiki.git - -[submodule "deb_dist"] - path = deb_dist - url = https://github.com/ArchiveBox/debian-archivebox.git -[submodule "brew_dist"] - path = brew_dist - url = https://github.com/ArchiveBox/homebrew-archivebox.git -[submodule "pip_dist"] - path = pip_dist - url = https://github.com/ArchiveBox/pip-archivebox.git -[submodule "docker"] - path = docker - url = https://github.com/ArchiveBox/docker-archivebox.git -[submodule "archivebox/vendor/base32-crockford"] - path = archivebox/vendor/base32-crockford - url = https://github.com/jbittel/base32-crockford -[submodule "archivebox/vendor/pocket"] - path = archivebox/vendor/pocket - url = https://github.com/tapanpandita/pocket -[submodule "archivebox/vendor/django-taggit"] - path = archivebox/vendor/django-taggit - url = https://github.com/jazzband/django-taggit -[submodule "archivebox/vendor/python-atomicwrites"] - path = archivebox/vendor/python-atomicwrites - url = https://github.com/untitaker/python-atomicwrites diff --git a/.npmignore b/.npmignore deleted file mode 100644 index 53fae0a8da..0000000000 --- a/.npmignore +++ /dev/null @@ -1,19 +0,0 @@ -tests/ -archivebox/ -archivebox.egg-info/ -build/ -dist/ -docs/ -etc/ -.github -.gitmodules -.dockerignore -.flake8 -CNAME -_config.yml -docker-compose.yaml -docker-compose.yml -Dockerfile -MANIFEST.in -Pipfile -setup.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000000..2d7525348e --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,67 @@ +default_language_version: + python: python3.13 + +repos: + - repo: https://github.com/asottile/yesqa + rev: v1.5.0 + hooks: + - id: yesqa + + - repo: https://github.com/codespell-project/codespell + rev: v2.4.1 + hooks: + - id: codespell + additional_dependencies: + - 
tomli + + - repo: https://github.com/asottile/pyupgrade + rev: v3.20.0 + hooks: + - id: pyupgrade + args: [--py313-plus] + + - repo: https://github.com/asottile/add-trailing-comma + rev: v3.1.0 + hooks: + - id: add-trailing-comma + + - repo: local + hooks: + - id: ruff-format + name: ruff-format + entry: uv run --active --no-sources ruff format + language: system + types_or: [python, pyi] + - id: ruff-check + name: ruff-check + entry: uv run --active --no-sources ruff check --fix + language: system + types_or: [python, pyi] + + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v5.0.0 + hooks: + - id: check-ast + - id: check-toml + - id: check-yaml + exclude: ^\.github/workflows/homebrew\.yml$ + - id: check-json + - id: check-merge-conflict + - id: check-symlinks + - id: destroyed-symlinks + - id: check-case-conflict + - id: check-illegal-windows-names + - id: check-shebang-scripts-are-executable + exclude: ^(archivebox/.*\.py|archivebox/tests/.*\.py|archivebox/personas/export_browser_state\.js)$ + - id: mixed-line-ending + - id: fix-byte-order-marker + - id: end-of-file-fixer + - id: detect-private-key + - id: debug-statements + - id: forbid-submodules + exclude: ^docs$ + - id: check-added-large-files + args: ["--maxkb=600"] + - id: name-tests-test + args: ["--pytest-test-first"] + exclude: ^archivebox/tests/(data/|fixtures\.py$|migrations_helpers\.py$) diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000000..f923e3c1c3 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,497 @@ +# Claude Code Development Guide for ArchiveBox + +## Quick Start + +```bash +# Set up dev environment (always use uv, never pip directly) +uv sync --dev --all-extras + +# Run tests as non-root user (required - ArchiveBox always refuses to run as root) +sudo -u testuser bash -c 'source .venv/bin/activate && python -m pytest archivebox/tests/ -v' +``` + +## Development Environment Setup + +### Prerequisites +- Python 3.11+ (3.13 recommended) +- uv package manager +- A 
non-root user for running tests (e.g., `testuser`) + +### Install Dependencies +```bash +uv sync --dev --all-extras # Always use uv, never pip directly +``` + +### Activate Virtual Environment +```bash +source .venv/bin/activate +``` + +### Common Gotchas + +#### File Permissions +New files created by root need permissions fixed for testuser: +```bash +chmod 644 archivebox/tests/test_*.py +``` + +#### DATA_DIR Environment Variable +ArchiveBox commands must run inside a data directory. Tests use temp directories - the `run_archivebox()` helper sets `DATA_DIR` automatically. + +## Code Style Guidelines + +### Naming Conventions for Grep-ability +Use consistent naming for everything to enable easy grep-ability and logical grouping: + +**Principle**: Fewest unique names. If you must create a new unique name, make it grep and group well. + +**Examples**: +```python +# Filesystem migration methods - all start with fs_ +def fs_migration_needed() -> bool: ... +def fs_migrate() -> None: ... +def _fs_migrate_from_0_7_0_to_0_8_0() -> None: ... +def _fs_migrate_from_0_8_0_to_0_9_0() -> None: ... +def _fs_next_version(current: str) -> str: ... + +# Logging methods - ALL must start with log_ or _log +def log_migration_start(snapshot_id: str) -> None: ... +def _log_error(message: str) -> None: ... +def log_validation_result(ok: bool, msg: str) -> None: ... +``` + +**Rules**: +- Group related functions with common prefixes +- Use `_` prefix for internal/private helpers within the same family +- ALL logging-related methods MUST start with `log_` or `_log` +- Search for all migration functions: `grep -r "def.*fs_.*(" archivebox/` +- Search for all logging: `grep -r "def.*log_.*(" archivebox/` + +### Minimize Unique Names and Data Structures +**Do not invent new data structures, variable names, or keys if possible.** Try to use existing field names and data structures exactly to keep the total unique data structures and names in the codebase to an absolute minimum. 
+ +**Example - GOOD**: +```python +# Binary has overrides field +binary = Binary(overrides={'TIMEOUT': '60s'}) + +# Binary reuses the same field name and structure +class Binary(models.Model): + overrides = models.JSONField(default=dict) # Same name, same structure +``` + +**Example - BAD**: +```python +# Don't invent new names like custom_bin_cmds, binary_overrides, etc. +class Binary(models.Model): + custom_bin_cmds = models.JSONField(default=dict) # ❌ New unique name +``` + +**Principle**: If you're storing the same conceptual data (e.g., `overrides`), use the same field name across all models and keep the internal structure identical. This makes the codebase predictable and reduces cognitive load. + +## Testing + +### CRITICAL: Never Run as Root +ArchiveBox has a root check that prevents running as root user. All ArchiveBox commands (including tests) must run as non-root user inside a data directory: + +```bash +# Run all migration tests +sudo -u testuser bash -c 'source /path/to/.venv/bin/activate && python -m pytest archivebox/tests/test_migrations_*.py -v' + +# Run specific test file +sudo -u testuser bash -c 'source .venv/bin/activate && python -m pytest archivebox/tests/test_migrations_08_to_09.py -v' + +# Run single test +sudo -u testuser bash -c 'source .venv/bin/activate && python -m pytest archivebox/tests/test_migrations_fresh.py::TestFreshInstall::test_init_creates_database -xvs' +``` + +### Test File Structure +``` +archivebox/tests/ +├── test_migrations_helpers.py # Schemas, seeding functions, verification helpers +├── test_migrations_fresh.py # Fresh install tests +├── test_migrations_04_to_09.py # 0.4.x → 0.9.x migration tests +├── test_migrations_07_to_09.py # 0.7.x → 0.9.x migration tests +└── test_migrations_08_to_09.py # 0.8.x → 0.9.x migration tests +``` + +### Test Writing Standards + +#### NO MOCKS - Real Tests Only +Tests must exercise real code paths: +- Create real SQLite databases with version-specific schemas +- Seed with realistic 
test data +- Run actual `python -m archivebox` commands via subprocess +- Query SQLite directly to verify results + +**If something is hard to test**: Modify the implementation to make it easier to test, or fix the underlying issue. Never mock, skip, simulate, or exit early from a test because you can't get something working inside the test. + +#### NO SKIPS +Never use `@skip`, `skipTest`, or `pytest.mark.skip`. Every test must run. If a test is difficult, fix the code or test environment - don't disable the test. + +#### Strict Assertions +- `init` command must return exit code 0 (not `[0, 1]`) +- Verify ALL data is preserved, not just "at least one" +- Use exact counts (`==`) not loose bounds (`>=`) + +### Example Test Pattern +```python +def test_migration_preserves_snapshots(self): + """Migration should preserve all snapshots.""" + result = run_archivebox(self.work_dir, ['init'], timeout=45) + self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}") + + ok, msg = verify_snapshot_count(self.db_path, expected_count) + self.assertTrue(ok, msg) +``` + +### Testing Gotchas + +#### Extractors Disabled for Speed +Tests disable all extractors via environment variables for faster execution: +```python +env['SAVE_TITLE'] = 'False' +env['SAVE_FAVICON'] = 'False' +# ... etc +``` + +#### Timeout Settings +Use appropriate timeouts for migration tests (45s for init, 60s default). 
+ +### Plugin Testing & Code Coverage + +**Target: 80-90% coverage** for critical plugins (screenshot, chrome, singlefile, dom) + +```bash +# Run plugin tests with coverage (both Python + JavaScript) +bash bin/test_plugins.sh screenshot + +# View coverage reports +bash bin/test_plugins.sh --coverage-report +# Or individual reports: +coverage report --show-missing --include='archivebox/plugins/*' --omit='*/tests/*' +``` + +#### Plugin Test Structure + +Tests are **completely isolated** from ArchiveBox - they replicate production directory structure in temp dirs: + +```python +# Correct production paths: +# Crawl: DATA_DIR/users/{username}/crawls/YYYYMMDD/example.com/{crawl-id}/{plugin}/ +# Snapshot: DATA_DIR/users/{username}/snapshots/YYYYMMDD/example.com/{snapshot-uuid}/{plugin}/ + +with tempfile.TemporaryDirectory() as tmpdir: + data_dir = Path(tmpdir) + + # Crawl-level plugin (e.g., chrome launcher) + crawl_dir = data_dir / 'users' / 'testuser' / 'crawls' / '20240101' / 'example.com' / 'crawl-123' + chrome_dir = crawl_dir / 'chrome' + chrome_dir.mkdir(parents=True) + + # Snapshot-level plugin (e.g., screenshot) + snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-456' + screenshot_dir = snapshot_dir / 'screenshot' + screenshot_dir.mkdir(parents=True) + + # Run hook in its output directory + result = subprocess.run( + ['node', str(SCREENSHOT_HOOK), '--url=https://example.com'], + cwd=str(screenshot_dir), + env={**get_test_env(), 'EXTRA_CONTEXT': '{"snapshot_id":"snap-456"}'}, + capture_output=True, + timeout=120 + ) +``` + +#### Coverage Improvement Loop + +To improve from ~20% to 80%+: + +1. **Run tests**: `bash bin/test_plugins.sh screenshot` → Shows: `19.1% (13/68 ranges)` +2. **Identify gaps**: Check hook file for untested paths (session connection vs fallback, config branches, error cases) +3. 
**Add tests**: Test both execution paths (connect to session + launch own browser), skip conditions, error cases, config variations +4. **Verify**: Re-run tests → Should show: `85%+ (58+/68 ranges)` + +**Critical**: JavaScript hooks have TWO paths that both must be tested (connect to session ~50% + launch browser ~30% + shared ~20%). Testing only one path = max 50% coverage possible! + +## Database Migrations + +### Generate and Apply Migrations +```bash +# Generate migrations (run from archivebox subdirectory) +cd archivebox +./manage.py makemigrations + +# Apply migrations to test database +cd data/ +archivebox init +``` + +### Schema Versions +- **0.4.x**: First Django version. Tags as comma-separated string, no ArchiveResult model +- **0.7.x**: Tag model with M2M, ArchiveResult model, AutoField PKs +- **0.8.x**: Crawl/Seed models, UUID PKs, status fields, depth/retry_at +- **0.9.x**: Seed model removed, seed_id FK removed from Crawl + +### Testing a Migration Path +1. Create SQLite DB with source version schema (from `test_migrations_helpers.py`) +2. Seed with realistic test data using `seed_0_X_data()` +3. Run `archivebox init` to trigger migrations +4. Verify data preservation with `verify_*` functions +5. Test CLI commands work post-migration (`status`, `list`, `add`, etc.) + +### Squashed Migrations +When testing 0.8.x (dev branch), you must record ALL replaced migrations: +```python +# The squashed migration replaces these - all must be recorded +('core', '0023_alter_archiveresult_options_archiveresult_abid_and_more'), +('core', '0024_auto_20240513_1143'), +# ... all 52 migrations from 0023-0074 ... 
+('core', '0023_new_schema'), # Also record the squashed migration itself +``` + +### Migration Strategy +- Squashed migrations for clean installs +- Individual migrations recorded for upgrades from dev branch +- `replaces` attribute in squashed migrations lists what they replace + +### Migration Gotchas + +#### Circular FK References in Schemas +SQLite handles circular references with `IF NOT EXISTS`. Order matters less than in other DBs. + +## Plugin System Architecture + +### Plugin Dependency Rules + +Like other plugins, chrome plugins **ARE NOT ALLOWED TO DEPEND ON ARCHIVEBOX OR DJANGO**. +However, they are allowed to depend on two shared files ONLY: +- `archivebox/plugins/chrome/chrome_utils.js` ← source of truth API for all basic chrome ops +- `archivebox/plugins/chrome/tests/chrome_test_utils.py` ← use for your tests, do not implement launching/killing/pid files/cdp/etc. in python, just extend this file as needed. + +### Chrome-Dependent Plugins + +Many plugins depend on Chrome/Chromium via CDP (Chrome DevTools Protocol). When checking for script name references or debugging Chrome-related issues, check these plugins: + +**Main puppeteer-based chrome installer + launcher plugin**: +- `chrome` - Core Chrome integration (CDP, launch, navigation) + +**Metadata extraction using chrome/chrome_utils.js / CDP**: +- `dns` - DNS resolution info +- `ssl` - SSL certificate info +- `headers` - HTTP response headers +- `redirects` - Capture redirect chains +- `staticfile` - Direct file downloads (e.g. if the url itself is a .png, .exe, .zip, etc.) 
+- `responses` - Capture network responses +- `consolelog` - Capture console.log output +- `title` - Extract page title +- `accessibility` - Extract accessibility tree +- `seo` - Extract SEO metadata + +**Extensions installed using chrome/chrome_utils.js / controlled using CDP**: +- `ublock` - uBlock Origin ad blocking +- `istilldontcareaboutcookies` - Cookie banner dismissal +- `twocaptcha` - 2captcha CAPTCHA solver integration + +**Page-alteration plugins to prepare the content for archiving**: +- `modalcloser` - Modal dialog dismissal +- `infiniscroll` - Infinite scroll handler + +**Main Extractor Outputs**: +- `dom` - DOM snapshot extraction +- `pdf` - Generate PDF snapshots +- `screenshot` - Generate screenshots +- `singlefile` - SingleFile archival, can be single-file-cli that launches chrome, or singlefile extension running inside chrome + +**Crawl URL parsers** (post-process dom.html, singlefile.html, staticfile, responses, headers, etc. for URLs to re-emit as new queued Snapshots during recursive crawling): +- `parse_dom_outlinks` - Extract outlinks from DOM (special, uses CDP to directly query browser) +- `parse_html_urls` - Parse URLs from HTML (doesn't use chrome directly, just reads dom.html) +- `parse_jsonl_urls` - Parse URLs from JSONL (doesn't use chrome directly, just reads dom.html) +- `parse_netscape_urls` - Parse Netscape bookmark format (doesn't use chrome directly, just reads dom.html) + +### Finding Chrome-Dependent Plugins + +```bash +# Find all files containing "chrom" (case-insensitive) +grep -ri "chrom" archivebox/plugins/*/on_*.* --include="*.*" 2>/dev/null | cut -d: -f1 | sort -u + +# Or get just the plugin names +grep -ri "chrom" archivebox/plugins/*/on_*.* --include="*.*" 2>/dev/null | cut -d/ -f3 | sort -u +``` + +**Note**: This list may not be complete. Always run the grep command above when checking for Chrome-related script references or debugging Chrome integration issues. 
+ +## Architecture Notes + +### Crawl Model (0.9.x) +- Crawl groups multiple Snapshots from a single `add` command +- Each `add` creates one Crawl with one or more Snapshots +- Seed model was removed - crawls now store URLs directly + +## Code Coverage + +### Overview + +Coverage tracking is enabled for passive collection across all contexts: +- Unit tests (pytest) +- Integration tests +- Dev server (manual testing) +- CLI usage + +Coverage data accumulates in `.coverage` file and can be viewed/analyzed to find dead code. + +### Install Coverage Tools + +```bash +uv sync --dev # Installs pytest-cov and coverage +``` + +### Running with Coverage + +#### Unit Tests +```bash +# Run tests with coverage +pytest --cov=archivebox --cov-report=term archivebox/tests/ + +# Or run specific test file +pytest --cov=archivebox --cov-report=term archivebox/tests/test_migrations_08_to_09.py +``` + +#### Dev Server with Coverage +```bash +# Start dev server with coverage tracking +coverage run --parallel-mode -m archivebox server + +# Or CLI commands +coverage run --parallel-mode -m archivebox init +coverage run --parallel-mode -m archivebox add https://example.com +``` + +#### Manual Testing (Always-On) +To enable coverage during ALL Python executions (passive tracking): + +```bash +# Option 1: Use coverage run wrapper +coverage run --parallel-mode -m archivebox [command] + +# Option 2: Set environment variable (tracks everything) +export COVERAGE_PROCESS_START=pyproject.toml +# Now all Python processes will track coverage +archivebox server +archivebox add https://example.com +``` + +### Viewing Coverage + +#### Text Report (Quick View) +```bash +# Combine all parallel coverage data +coverage combine + +# View summary +coverage report + +# View detailed report with missing lines +coverage report --show-missing + +# View specific file +coverage report --include="archivebox/core/models.py" --show-missing +``` + +#### JSON Report (LLM-Friendly) +```bash +# Generate JSON report 
+coverage json + +# View the JSON +cat coverage.json | jq '.files | keys' # List all files + +# Find files with low coverage +cat coverage.json | jq -r '.files | to_entries[] | select(.value.summary.percent_covered < 50) | "\(.key): \(.value.summary.percent_covered)%"' + +# Find completely uncovered files (dead code candidates) +cat coverage.json | jq -r '.files | to_entries[] | select(.value.summary.percent_covered == 0) | .key' + +# Get missing lines for a specific file +cat coverage.json | jq '.files["archivebox/core/models.py"].missing_lines' +``` + +#### HTML Report (Visual) +```bash +# Generate interactive HTML report +coverage html + +# Open in browser +open htmlcov/index.html +``` + +### Isolated Runs + +To measure coverage for specific scenarios: + +```bash +# 1. Reset coverage data +coverage erase + +# 2. Run your isolated test/scenario +pytest --cov=archivebox archivebox/tests/test_migrations_fresh.py +# OR +coverage run --parallel-mode -m archivebox add https://example.com + +# 3. View results +coverage combine +coverage report --show-missing + +# 4. Optionally export for analysis +coverage json +``` + +### Finding Dead Code + +```bash +# 1. Run comprehensive tests + manual testing to build coverage +pytest --cov=archivebox archivebox/tests/ +coverage run --parallel-mode -m archivebox server # Use the app manually +coverage combine + +# 2. Find files with 0% coverage (strong dead code candidates) +coverage json +cat coverage.json | jq -r '.files | to_entries[] | select(.value.summary.percent_covered == 0) | .key' + +# 3. Find files with <10% coverage (likely dead code) +cat coverage.json | jq -r '.files | to_entries[] | select(.value.summary.percent_covered < 10) | "\(.key): \(.value.summary.percent_covered)%"' | sort -t: -k2 -n + +# 4. 
Generate detailed report for analysis +coverage report --show-missing > coverage_report.txt +``` + +### Tips + +- **Parallel mode** (`--parallel-mode`): Allows multiple processes to track coverage simultaneously without conflicts +- **Combine**: Always run `coverage combine` before viewing reports to merge parallel data +- **Reset**: Use `coverage erase` to start fresh for isolated measurements +- **Branch coverage**: Enabled by default - tracks if both branches of if/else are executed +- **Exclude patterns**: Config in `pyproject.toml` excludes tests, migrations, type stubs + +## Debugging Tips + +### Check Migration State +```bash +sqlite3 /path/to/index.sqlite3 "SELECT app, name FROM django_migrations WHERE app='core' ORDER BY id;" +``` + +### Check Table Schema +```bash +sqlite3 /path/to/index.sqlite3 "PRAGMA table_info(core_snapshot);" +``` + +### Verbose Test Output +```bash +sudo -u testuser bash -c 'source .venv/bin/activate && python -m pytest archivebox/tests/test_migrations_08_to_09.py -xvs 2>&1 | head -200' +``` + +### Kill Zombie Chrome Processes +```bash +./bin/kill_chrome.sh +``` diff --git a/CNAME b/CNAME deleted file mode 100644 index 4ff42236ef..0000000000 --- a/CNAME +++ /dev/null @@ -1 +0,0 @@ -archivebox.io \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 81e5f196f8..e483bcb8b3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,125 +1,396 @@ -# This is the Dockerfile for ArchiveBox, it bundles the following dependencies: -# python3, ArchiveBox, curl, wget, git, chromium, youtube-dl, single-file +# This is the Dockerfile for ArchiveBox, it bundles the following main dependencies: +# python3.13, uv, python3-ldap +# curl, wget, git, dig, ping, tree, nano +# node, npm, single-file, readability-extractor, postlight-parser +# ArchiveBox, yt-dlp, playwright, chromium # Usage: -# docker build . -t archivebox --no-cache +# git clone https://github.com/ArchiveBox/ArchiveBox && cd ArchiveBox +# docker build . 
-t archivebox # docker run -v "$PWD/data":/data archivebox init # docker run -v "$PWD/data":/data archivebox add 'https://example.com' # docker run -v "$PWD/data":/data -it archivebox manage createsuperuser # docker run -v "$PWD/data":/data -p 8000:8000 archivebox server +# Multi-arch build: +# docker buildx create --use +# docker buildx build . --platform=linux/amd64,linux/arm64 --push -t archivebox/archivebox:dev -t archivebox/archivebox:sha-abc123 +# Read more here: https://github.com/ArchiveBox/ArchiveBox#archivebox-development -FROM python:3.9-slim-buster + +######################################################################################### + +### Example: Using ArchiveBox in your own project's Dockerfile ######## + +# FROM python:3.13-slim +# WORKDIR /data +# RUN pip install archivebox>=0.9.0 # use latest release here +# RUN archivebox install +# RUN useradd -ms /bin/bash archivebox && chown -R archivebox /data + +######################################################################################### + +FROM ubuntu:24.04 LABEL name="archivebox" \ - maintainer="Nick Sweeting " \ - description="All-in-one personal internet archiving container" \ + maintainer="Nick Sweeting " \ + description="All-in-one self-hosted internet archiving solution" \ homepage="https://github.com/ArchiveBox/ArchiveBox" \ - documentation="https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#docker" + documentation="https://github.com/ArchiveBox/ArchiveBox/wiki/Docker" \ + org.opencontainers.image.title="ArchiveBox" \ + org.opencontainers.image.vendor="ArchiveBox" \ + org.opencontainers.image.description="All-in-one self-hosted internet archiving solution" \ + org.opencontainers.image.source="https://github.com/ArchiveBox/ArchiveBox" \ + com.docker.image.source.entrypoint="Dockerfile" \ + # TODO: release ArchiveBox as a Docker Desktop extension (requires these labels): + # https://docs.docker.com/desktop/extensions-sdk/architecture/metadata/ + 
com.docker.desktop.extension.api.version=">= 1.4.7" \ + com.docker.desktop.extension.icon="https://archivebox.io/icon.png" \ + com.docker.extension.publisher-url="https://archivebox.io" \ + com.docker.extension.screenshots='[{"alt": "Screenshot of Admin UI", "url": "https://github.com/ArchiveBox/ArchiveBox/assets/511499/e8e0b6f8-8fdf-4b7f-8124-c10d8699bdb2"}]' \ + com.docker.extension.detailed-description='See here for detailed documentation: https://wiki.archivebox.io' \ + com.docker.extension.changelog='See here for release notes: https://github.com/ArchiveBox/ArchiveBox/releases' \ + com.docker.extension.categories='database,utility-tools' + +ARG TARGETPLATFORM +ARG TARGETOS +ARG TARGETARCH +ARG TARGETVARIANT +######### Environment Variables ################################# -# System-level base config +# Global build-time and runtime environment constants + default pkg manager config ENV TZ=UTC \ LANGUAGE=en_US:en \ LC_ALL=C.UTF-8 \ LANG=C.UTF-8 \ + DEBIAN_FRONTEND=noninteractive \ + APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=1 \ PYTHONIOENCODING=UTF-8 \ PYTHONUNBUFFERED=1 \ - DEBIAN_FRONTEND=noninteractive \ - APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=1 + PIP_DISABLE_PIP_VERSION_CHECK=1 \ + npm_config_loglevel=error -# Application-level base config +# Language Version config +ENV PYTHON_VERSION=3.13 \ + NODE_VERSION=22 + +# Non-root User config +ENV ARCHIVEBOX_USER="archivebox" \ + DEFAULT_PUID=911 \ + DEFAULT_PGID=911 \ + IN_DOCKER=True + +# ArchiveBox Source Code + Lib + Data paths ENV CODE_DIR=/app \ - VENV_PATH=/venv \ DATA_DIR=/data \ - NODE_DIR=/node \ - ARCHIVEBOX_USER="archivebox" + PLAYWRIGHT_BROWSERS_PATH=/browsers + +# Bash SHELL config +# http://redsymbol.net/articles/unofficial-bash-strict-mode/ +SHELL ["/bin/bash", "-o", "pipefail", "-o", "errexit", "-o", "errtrace", "-o", "nounset", "-c"] + +######### System Environment #################################### + +# Detect ArchiveBox version number by reading pyproject.toml (also serves to invalidate the entire 
build cache whenever pyproject.toml changes) +WORKDIR "$CODE_DIR" + +# Force apt to leave downloaded binaries in /var/cache/apt (massively speeds up back-to-back Docker builds) +RUN echo 'Binary::apt::APT::Keep-Downloaded-Packages "1";' > /etc/apt/apt.conf.d/99keep-cache \ + && echo 'APT::Install-Recommends "0";' > /etc/apt/apt.conf.d/99no-intall-recommends \ + && echo 'APT::Install-Suggests "0";' > /etc/apt/apt.conf.d/99no-intall-suggests \ + && rm -f /etc/apt/apt.conf.d/docker-clean + +# Print debug info about build and save it to disk, for human eyes only, not used by anything else +RUN (echo "[i] Docker build for ArchiveBox starting..." \ + && echo "PLATFORM=${TARGETPLATFORM} ARCH=$(uname -m) ($(uname -s) ${TARGETARCH} ${TARGETVARIANT})" \ + && echo "BUILD_START_TIME=$(date +"%Y-%m-%d %H:%M:%S %s") TZ=${TZ} LANG=${LANG}" \ + && echo \ + && echo "PYTHON=${PYTHON_VERSION} NODE=${NODE_VERSION} PATH=${PATH}" \ + && echo "CODE_DIR=${CODE_DIR} DATA_DIR=${DATA_DIR}" \ + && echo \ + && uname -a \ + && cat /etc/os-release | head -n7 \ + && which bash && bash --version | head -n1 \ + && which dpkg && dpkg --version | head -n1 \ + && echo -e '\n\n' && env && echo -e '\n\n' \ + ) | tee -a /VERSION.txt # Create non-privileged user for archivebox and chrome -RUN groupadd --system $ARCHIVEBOX_USER \ - && useradd --system --create-home --gid $ARCHIVEBOX_USER --groups audio,video $ARCHIVEBOX_USER +RUN echo "[*] Setting up $ARCHIVEBOX_USER user uid=${DEFAULT_PUID}..." 
\ + && groupadd --system $ARCHIVEBOX_USER \ + && useradd --system --create-home --gid $ARCHIVEBOX_USER --groups audio,video $ARCHIVEBOX_USER \ + && usermod -u "$DEFAULT_PUID" "$ARCHIVEBOX_USER" \ + && groupmod -g "$DEFAULT_PGID" "$ARCHIVEBOX_USER" \ + && echo -e "\nARCHIVEBOX_USER=$ARCHIVEBOX_USER PUID=$(id -u $ARCHIVEBOX_USER) PGID=$(id -g $ARCHIVEBOX_USER)\n\n" \ + | tee -a /VERSION.txt + # DEFAULT_PUID and DEFAULT_PGID are overridden by PUID and PGID in /bin/docker_entrypoint.sh at runtime + # https://docs.linuxserver.io/general/understanding-puid-and-pgid -# Install system dependencies -RUN apt-get update -qq \ - && apt-get install -qq -y --no-install-recommends \ - apt-transport-https ca-certificates gnupg2 zlib1g-dev \ - dumb-init gosu cron unzip curl \ +# Install system apt dependencies (adding backports to access more recent apt updates) +RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT \ + echo "[+] APT Installing base system dependencies for $TARGETPLATFORM..." \ + && mkdir -p /etc/apt/keyrings \ + && apt-get update -qq \ + && apt-get install -qq -y \ + # 1. packaging dependencies + apt-transport-https ca-certificates apt-utils gnupg2 curl wget \ + # 2. docker and init system dependencies + zlib1g-dev dumb-init gosu cron unzip grep dnsutils \ + # 3. frivolous CLI helpers to make debugging failed archiving easier + tree nano iputils-ping \ + # nano iputils-ping dnsutils htop procps jq yq && rm -rf /var/lib/apt/lists/* -# Install apt dependencies -RUN apt-get update -qq \ +# Install apt binary dependencies for extractors +# COPY --from=selenium/ffmpeg:latest /usr/local/bin/ffmpeg /usr/local/bin/ffmpeg +RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT \ + echo "[+] APT Installing extractor dependencies for $TARGETPLATFORM..." 
\ + && apt-get update -qq \ && apt-get install -qq -y --no-install-recommends \ - wget curl chromium git ffmpeg youtube-dl ripgrep \ - fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \ - && rm -rf /var/lib/apt/lists/* + git ripgrep \ + # Packages we have also needed in the past: + # youtube-dl wget2 aria2 python3-pyxattr rtmpdump libfribidi-bin mpv \ + # curl wget (already installed above) + && rm -rf /var/lib/apt/lists/* \ + # Save version info + && ( \ + which curl && curl --version | head -n1 \ + && which wget && wget --version 2>&1 | head -n1 \ + && which git && git --version 2>&1 | head -n1 \ + # && which ffmpeg && (ffmpeg --version 2>&1 | head -n1) || true \ + && which rg && rg --version 2>&1 | head -n1 \ + && echo -e '\n\n' \ + ) | tee -a /VERSION.txt + +# Install sonic search backend +COPY --from=archivebox/sonic:1.4.9 /usr/local/bin/sonic /usr/local/bin/sonic +COPY --chown=root:root --chmod=755 "etc/sonic.cfg" /etc/sonic.cfg +RUN (which sonic && sonic --version) | tee -a /VERSION.txt + +######### Language Environments #################################### -# Install Node environment -RUN curl -s https://deb.nodesource.com/gpgkey/nodesource.gpg.key | apt-key add - \ - && echo 'deb https://deb.nodesource.com/node_15.x buster main' >> /etc/apt/sources.list \ +# Set up Python environment +# NOT NEEDED because we're using a pre-built python image, keeping this here in case we switch back to custom-building our own: +#RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT \ +# --mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH$TARGETVARIANT \ +# RUN echo "[+] APT Installing PYTHON $PYTHON_VERSION for $TARGETPLATFORM (skipped, provided by base image)..." 
\ + # && apt-get update -qq \ + # && apt-get install -qq -y --no-upgrade \ + # python${PYTHON_VERSION} python${PYTHON_VERSION}-minimal python3-pip python${PYTHON_VERSION}-venv pipx \ + # && rm -rf /var/lib/apt/lists/* \ + # tell PDM to allow using global system python site packages + # && rm /usr/lib/python3*/EXTERNALLY-MANAGED \ + # && ln -s "$(which python${PYTHON_VERSION})" /usr/bin/python \ + # create global virtual environment GLOBAL_VENV to use (better than using pip install --global) + # && python3 -m venv --system-site-packages --symlinks $GLOBAL_VENV \ + # && python3 -m venv --system-site-packages $GLOBAL_VENV \ + # && python3 -m venv $GLOBAL_VENV \ + # install global dependencies / python build dependencies in GLOBAL_VENV + # && pip install --upgrade pip setuptools wheel \ + # Save version info + # && ( \ + # which python3 && python3 --version | grep " $PYTHON_VERSION" \ + # && which pip && pip --version \ + # # && which pdm && pdm --version \ + # && echo -e '\n\n' \ + # ) | tee -a /VERSION.txt + + +# Set up Node environment +RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT \ + --mount=type=cache,target=/root/.npm,sharing=locked,id=npm-$TARGETARCH$TARGETVARIANT \ + echo "[+] APT Installing NODE $NODE_VERSION for $TARGETPLATFORM..." 
\ + && echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_${NODE_VERSION}.x nodistro main" >> /etc/apt/sources.list.d/nodejs.list \ + && curl -fsSL "https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key" | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg \ && apt-get update -qq \ - && apt-get install -qq -y --no-install-recommends \ + && apt-get install -qq -y --no-upgrade libatomic1 \ + && apt-get install -y --no-upgrade \ nodejs \ - # && npm install -g npm \ - && rm -rf /var/lib/apt/lists/* + && rm -rf /var/lib/apt/lists/* \ + # Update NPM to latest version + && npm i -g npm --cache /root/.npm \ + # Save version info + && ( \ + which node && node --version \ + && which npm && npm --version \ + && echo -e '\n\n' \ + ) | tee -a /VERSION.txt -# Install Node dependencies -WORKDIR "$NODE_DIR" -ENV PATH="${PATH}:$NODE_DIR/node_modules/.bin" \ - npm_config_loglevel=error -ADD ./package.json ./package.json -ADD ./package-lock.json ./package-lock.json -RUN npm ci -# Install Python dependencies +# Set up uv and main app /venv +RUN curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR=/bin sh +ENV UV_COMPILE_BYTECODE=1 \ + UV_PYTHON_PREFERENCE=managed \ + UV_PYTHON_INSTALL_DIR=/opt/uv/python \ + UV_LINK_MODE=copy \ + UV_PROJECT_ENVIRONMENT=/venv WORKDIR "$CODE_DIR" -ENV PATH="${PATH}:$VENV_PATH/bin" -RUN python -m venv --clear --symlinks "$VENV_PATH" \ - && pip install --upgrade --quiet pip setuptools -ADD "./setup.py" "$CODE_DIR/" -ADD "./package.json" "$CODE_DIR/archivebox/" -RUN apt-get update -qq \ +# COPY --chown=root:root --chmod=755 pyproject.toml "$CODE_DIR/" +RUN --mount=type=cache,target=/root/.cache/uv,sharing=locked,id=uv-$TARGETARCH$TARGETVARIANT \ + echo "[+] UV Creating /venv using python ${PYTHON_VERSION} for ${TARGETPLATFORM}..." 
\ + && uv venv /venv --python ${PYTHON_VERSION} +ENV VIRTUAL_ENV=/venv PATH="/venv/bin:$PATH" +RUN uv pip install setuptools pip \ + && ( \ + which python3 && python3 --version \ + && which uv && uv self version \ + && uv python find --system && uv python find \ + && echo -e '\n\n' \ + ) | tee -a /VERSION.txt + + +######### ArchiveBox & Extractor Dependencies ################################## + +# Install ArchiveBox C-compiled/apt-installed Python dependencies in app /venv (currently only used for python-ldap) +WORKDIR "$CODE_DIR" +RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT \ + --mount=type=cache,target=/root/.cache/uv,sharing=locked,id=uv-$TARGETARCH$TARGETVARIANT \ + #--mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH$TARGETVARIANT \ + echo "[+] APT Installing + Compiling python3-ldap for PIP archivebox[ldap] on ${TARGETPLATFORM}..." \ + && apt-get update -qq \ && apt-get install -qq -y --no-install-recommends \ - build-essential python-dev python3-dev \ - && echo 'empty placeholder for setup.py to use' > "$CODE_DIR/archivebox/README.md" \ - && python3 -c 'from distutils.core import run_setup; result = run_setup("./setup.py", stop_after="init"); print("\n".join(result.install_requires + result.extras_require["sonic"]))' > /tmp/requirements.txt \ - && pip install --quiet -r /tmp/requirements.txt \ - && apt-get purge -y build-essential python-dev python3-dev \ + build-essential gcc \ + python3-dev libssl-dev libldap2-dev libsasl2-dev python3-ldap \ + python3-msgpack python3-mutagen python3-regex python3-pycryptodome procps \ + && uv pip install \ + "python-ldap>=3.4.3" \ + && apt-get purge -y \ + python3-dev build-essential gcc \ && apt-get autoremove -y \ && rm -rf /var/lib/apt/lists/* -# Install apt development dependencies -# RUN apt-get install -qq \ -# && apt-get install -qq -y --no-install-recommends \ -# python3 python3-dev python3-pip python3-venv python3-all \ -# dh-python 
debhelper devscripts dput software-properties-common \ -# python3-distutils python3-setuptools python3-wheel python3-stdeb -# RUN python3 -c 'from distutils.core import run_setup; result = run_setup("./setup.py", stop_after="init"); print("\n".join(result.extras_require["dev"]))' > /tmp/dev_requirements.txt \ - # && pip install --quiet -r /tmp/dev_requirements.txt -# Install ArchiveBox Python package and its dependencies +# Install apt font & rendering dependencies for chromium browser +# TODO: figure out how much of this overlaps with `playwright install-deps chromium` +# RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT \ + +# Install chromium browser binary using playwright +RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT \ + --mount=type=cache,target=/root/.cache/ms-playwright,sharing=locked,id=browsers-$TARGETARCH$TARGETVARIANT \ + --mount=type=cache,target=/root/.cache/uv,sharing=locked,id=uv-$TARGETARCH$TARGETVARIANT \ + echo "[+] APT Installing CHROMIUM dependencies, fonts, and display libraries for $TARGETPLATFORM..." 
\ + && apt-get update -qq \ + && apt-get install -qq -y \ + #fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-khmeros fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \ + #at-spi2-common fonts-liberation fonts-noto-color-emoji fonts-tlwg-loma-otf fonts-unifont libatk-bridge2.0-0 libatk1.0-0 libatspi2.0-0 libavahi-client3 \ + #libavahi-common-data libavahi-common3 libcups2 libfontenc1 libice6 libnspr4 libnss3 libsm6 libunwind8 \ + #libxaw7 libxcomposite1 libxdamage1 libxfont2 \ + libxkbfile1 libxmu6 libxpm4 libxt6 x11-xkb-utils x11-utils xfonts-encodings \ + # xfonts-scalable xfonts-utils xserver-common xvfb \ + # chrome can run without dbus/upower technically, it complains about missing dbus but should run ok anyway + # libxss1 dbus dbus-x11 upower \ + # && service dbus start \ + && echo "[+] PIP Installing playwright into /venv and CHROMIUM binary into $PLAYWRIGHT_BROWSERS_PATH..." \ + && uv pip install "playwright>=1.49.1" \ + && uv run playwright install chromium --no-shell --with-deps \ + && export CHROME_BINARY="$(uv run python -c 'from playwright.sync_api import sync_playwright; print(sync_playwright().start().chromium.executable_path)')" \ + && ln -s "$CHROME_BINARY" /usr/bin/chromium-browser \ + && ln -s /browsers/ffmpeg-*/ffmpeg-linux /usr/bin/ffmpeg \ + && mkdir -p "/home/${ARCHIVEBOX_USER}/.config/chromium/Crash Reports/pending/" \ + && chown -R "$DEFAULT_PUID:$DEFAULT_PGID" "/home/${ARCHIVEBOX_USER}/.config" \ + && mkdir -p "$PLAYWRIGHT_BROWSERS_PATH" \ + && chown -R $ARCHIVEBOX_USER "$PLAYWRIGHT_BROWSERS_PATH" \ + # delete extra full copy of node that playwright installs (saves >100mb) + && rm -f /venv/lib/python$PYTHON_VERSION/site-packages/playwright/driver/node \ + # Save version info + && rm -rf /var/lib/apt/lists/* \ + && ( \ + uv pip show playwright \ + && which chromium-browser && /usr/bin/chromium-browser --version || /usr/lib/chromium/chromium --version \ + && which ffmpeg && ffmpeg -version \ + && echo -e '\n\n' \ 
+ ) | tee -a /VERSION.txt + +# Install Node extractor dependencies +ENV PATH="/home/$ARCHIVEBOX_USER/.npm/bin:$PATH" +USER $ARCHIVEBOX_USER +WORKDIR "/home/$ARCHIVEBOX_USER/.npm" +RUN --mount=type=cache,target=/home/archivebox/.npm_cache,sharing=locked,id=npm-$TARGETARCH$TARGETVARIANT,uid=$DEFAULT_PUID,gid=$DEFAULT_PGID \ + echo "[+] NPM Installing node extractor dependencies into /home/$ARCHIVEBOX_USER/.npm..." \ + && npm config set prefix "/home/$ARCHIVEBOX_USER/.npm" \ + && npm install --global --prefer-offline --no-fund --no-audit --cache "/home/$ARCHIVEBOX_USER/.npm_cache" \ + "@postlight/parser@^2.2.3" \ + "readability-extractor@github:ArchiveBox/readability-extractor" \ + "single-file-cli@^1.1.54" \ + "puppeteer@^23.5.0" \ + "@puppeteer/browsers@^2.4.0" \ + && rm -Rf "/home/$ARCHIVEBOX_USER/.cache/puppeteer" +USER root WORKDIR "$CODE_DIR" -ADD . "$CODE_DIR" -RUN pip install -e . +RUN ( \ + which node && node --version \ + && which npm && npm version \ + && which postlight-parser \ + && which readability-extractor && readability-extractor --version \ + && which single-file && single-file --version \ + && which puppeteer && puppeteer --version \ + && echo -e '\n\n' \ + ) | tee -a /VERSION.txt + +######### Build Dependencies #################################### + + +# Install ArchiveBox Python venv dependencies from uv.lock +RUN --mount=type=bind,source=pyproject.toml,target=/app/pyproject.toml \ + --mount=type=bind,source=uv.lock,target=/app/uv.lock \ + --mount=type=cache,target=/root/.cache/uv,sharing=locked,id=uv-$TARGETARCH$TARGETVARIANT \ + echo "[+] PIP Installing ArchiveBox dependencies from pyproject.toml and uv.lock..." \ + && uv sync \ + --frozen \ + --inexact \ + --all-extras \ + --no-install-project \ + --no-install-workspace + # installs the pip packages that archivebox depends on, defined in pyproject.toml and uv.lock dependencies + +# Install ArchiveBox Python package + workspace dependencies from source +COPY --chown=root:root --chmod=755 "." 
"$CODE_DIR/" +RUN --mount=type=cache,target=/root/.cache/uv,sharing=locked,id=uv-$TARGETARCH$TARGETVARIANT \ + echo "[*] Installing ArchiveBox Python source code from $CODE_DIR..." \ + && pip install \ + --no-deps \ + "$CODE_DIR" \ + && ( \ + pip show archivebox \ + && which archivebox \ + && echo -e '\n\n' \ + ) | tee -a /VERSION.txt + # installs archivebox itself, and any other vendored packages in pkgs/*, defined in pyproject.toml workspaces + +#################################################### # Setup ArchiveBox runtime config +ENV TMP_DIR=/tmp/archivebox \ + LIB_DIR=/usr/share/archivebox/lib \ + GOOGLE_API_KEY=no \ + GOOGLE_DEFAULT_CLIENT_ID=no \ + GOOGLE_DEFAULT_CLIENT_SECRET=no + WORKDIR "$DATA_DIR" -ENV IN_DOCKER=True \ - CHROME_SANDBOX=False \ - CHROME_BINARY="chromium" \ - USE_SINGLEFILE=True \ - SINGLEFILE_BINARY="$NODE_DIR/node_modules/.bin/single-file" \ - USE_READABILITY=True \ - READABILITY_BINARY="$NODE_DIR/node_modules/.bin/readability-extractor" \ - USE_MERCURY=True \ - MERCURY_BINARY="$NODE_DIR/node_modules/.bin/mercury-parser" +RUN openssl rand -hex 16 > /etc/machine-id \ + && mkdir -p "$TMP_DIR" \ + && chown -R "$DEFAULT_PUID:$DEFAULT_PGID" "$TMP_DIR" \ + && mkdir -p "$LIB_DIR" \ + && chown -R "$DEFAULT_PUID:$DEFAULT_PGID" "$LIB_DIR" \ + && echo -e "\nTMP_DIR=$TMP_DIR\nLIB_DIR=$LIB_DIR\nMACHINE_ID=$(cat /etc/machine-id)\n" | tee -a /VERSION.txt # Print version for nice docker finish summary -# RUN archivebox version -RUN /app/bin/docker_entrypoint.sh archivebox version +RUN (echo -e "\n\n[√] Finished Docker build successfully. 
Saving build summary in: /VERSION.txt" \ + && echo -e "PLATFORM=${TARGETPLATFORM} ARCH=$(uname -m) ($(uname -s) ${TARGETARCH} ${TARGETVARIANT})\n" \ + && echo -e "BUILD_END_TIME=$(date +"%Y-%m-%d %H:%M:%S %s")\n\n" \ + ) | tee -a /VERSION.txt + +# Verify ArchiveBox is installed and print version info +RUN chmod +x "$CODE_DIR"/bin/*.sh \ + && gosu "$DEFAULT_PUID" archivebox version 2>&1 | tee -a /VERSION.txt || true -# Open up the interfaces to the outside world +#################################################### + +# Expose ArchiveBox's main interfaces to the outside world +WORKDIR "$DATA_DIR" VOLUME "$DATA_DIR" EXPOSE 8000 HEALTHCHECK --interval=30s --timeout=20s --retries=15 \ - CMD curl --silent 'http://localhost:8000/admin/login/' || exit 1 + CMD curl --silent 'http://admin.archivebox.localhost:8000/health/' | grep -q 'OK' ENTRYPOINT ["dumb-init", "--", "/app/bin/docker_entrypoint.sh"] -CMD ["archivebox", "server", "--quick-init", "0.0.0.0:8000"] +CMD ["archivebox", "server", "--init", "0.0.0.0:8000"] diff --git a/LICENSE b/LICENSE index ea201f9f9d..4261347ae3 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2020 Nick Sweeting +Copyright (c) 2024 Nick Sweeting Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index f33f160f9d..0000000000 --- a/MANIFEST.in +++ /dev/null @@ -1,6 +0,0 @@ -graft archivebox -global-exclude .DS_Store -global-exclude __pycache__ -global-exclude *.pyc - -prune tests/ diff --git a/Pipfile b/Pipfile deleted file mode 100644 index 78cec54d32..0000000000 --- a/Pipfile +++ /dev/null @@ -1,12 +0,0 @@ -[[source]] -name = "pypi" -url = "https://pypi.org/simple" -verify_ssl = true - -[packages] -# see setup.py for package dependency list -"e1839a8" = {path = ".", editable = true} - -[dev-packages] -# see setup.py for dev package dependency list 
-"e1839a8" = {path = ".", extras = ["dev"], editable = true} diff --git a/README.md b/README.md index e6d235977a..8aa1d6ac2d 100644 --- a/README.md +++ b/README.md @@ -1,331 +1,636 @@ -
- +
+

ArchiveBox
Open-source self-hosted web archiving.

-â–ļī¸ Quickstart | -Demo | -Github | -Documentation | -Info & Motivation | -Community | -Roadmap +
+ +â–ļī¸ Quickstart | Demo | GitHub | Documentation | Info & Motivation | Community -
-"Your own personal internet archive" (įŊ‘įĢ™å­˜æĄŖ / įˆŦč™Ģ)
-
+
- - - - -
- -Language grade: Python -Language grade: JavaScript -Total alerts +     + -
+
+
-**ArchiveBox is a powerful, self-hosted internet archiving solution to collect, save, and view sites you want to preserve offline.** +**ArchiveBox is a self-hosted app that lets you preserve content from websites in a variety of formats.** -You can set it up as a [command-line tool](#Quickstart), [web app](#Quickstart), and [desktop app](https://github.com/ArchiveBox/electron-archivebox) (alpha), on Linux, macOS, and Windows. +We aim to make your data immediately useful, and kept in formats that other programs can read directly. As output, we save standard HTML, PNG, PDF, TXT, JSON, WARC, SQLite, all guaranteed to be readable for decades to come. ArchiveBox also has a CLI, REST API, and webhooks so you can set up integrations with other services. -**You can feed it URLs one at a time, or schedule regular imports** from browser bookmarks or history, feeds like RSS, bookmark services like Pocket/Pinboard, and more. See input formats for a full list. +Without active preservation effort, everything on the internet eventually disappears or degrades. -**It saves snapshots of the URLs you feed it in several formats:** HTML, PDF, PNG screenshots, WARC, and more out-of-the-box, with a wide variety of content extracted and preserved automatically (article text, audio/video, git repos, etc.). See output formats for a full list. +*ArchiveBox is an open source tool that lets organizations & individuals archive both public & private web content while retaining control over their data. It can be used to save copies of bookmarks, preserve evidence for legal cases, backup photos from FB/Insta/Flickr or media from YT/Soundcloud/etc., save research papers, and more...* +
-The goal is to sleep soundly knowing the part of the internet you care about will be automatically preserved in durable, easily accessable formats for decades after it goes down. +> ➡️ Get ArchiveBox with `pip install archivebox` on [Linux](#quickstart)/[macOS](#quickstart), or via **[Docker](#quickstart)** ⭐️ on any OS. + +*Once installed, you can interact with it through the: [Browser Extension](https://github.com/ArchiveBox/archivebox-browser-extension), [CLI](#usage), [self-hosted web interface](https://github.com/ArchiveBox/ArchiveBox/wiki/Publishing-Your-Archive), [Python API](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#python-shell-usage), or [filesystem](#static-archive-exporting).* -
-

-bookshelf graphic   logo   bookshelf graphic -

-Demo | Screenshots | Usage
-. . . . . . . . . . . . . . . . . . . . . . . . . . . . -

-
+
+
+ +📥 **You can feed ArchiveBox URLs one at a time, or schedule regular imports** from your bookmarks or history, social media feeds or RSS, link-saving services like Pocket/Pinboard, our [Browser Extension](https://github.com/ArchiveBox/archivebox-browser-extension), and more. +See Input Formats for a full list of supported input formats...
-**đŸ“Ļ  Install ArchiveBox with [Docker Compose (recommended)](#Quickstart) / Docker, or `apt` / `brew` / `pip` ([see below](#Quickstart)).** +snapshot detail page -*No matter which setup method you choose, they all follow this basic process and provide the same CLI, Web UI, and on-disk data layout.* +**It saves snapshots of the URLs you feed it in several redundant formats.** +It also detects any content featured *inside* pages & extracts it out into a folder: +- 🌐 **HTML**/**Any websites** âžĄī¸ `original HTML+CSS+JS`, `singlefile HTML`, `screenshot PNG`, `PDF`, `WARC`, `title`, `article text`, `favicon`, `headers`, ... +- đŸŽĨ **Social Media**/**News** âžĄī¸ `post content TXT`, `comments`, `title`, `author`, `images`, ... +- đŸŽŦ **YouTube**/**SoundCloud**/etc. âžĄī¸ `MP3/MP4`s, `subtitles`, `metadata`, `thumbnail`, ... +- 💾 **Github**/**Gitlab**/etc. links âžĄī¸ `clone of GIT source code`, `README`, `images`, ... +- ✨ *and more, see [Output Formats](#output-formats) below...* -1. Once you've installed ArchiveBox, run this in a new empty folder to get started -```bash -archivebox init --setup # creates a new collection in the current directory -``` +You can run ArchiveBox as a Docker web app to manage these snapshots, or continue accessing the same collection using the `pip`-installed CLI, Python API, and SQLite3 APIs. +All the ways of using it are equivalent, and provide matching features like adding tags, scheduling regular crawls, viewing logs, and more... -2. Add some URLs you want to archive -```bash -archivebox add 'https://example.com' # add URLs one at a time via args / piped stdin -archivebox schedule --every=day --depth=1 https://example.com/rss.xml # or have it import URLs on a schedule -``` +
+
-3. Then view your archived pages -```bash -archivebox server 0.0.0.0:8000 # use the interactive web UI -archivebox list 'https://example.com' # use the CLI commands (--help for more) -ls ./archive/*/index.json # or browse directly via the filesystem -``` +đŸ› ī¸ ArchiveBox uses [standard tools](#dependencies) like Chrome, [`wget`](https://www.gnu.org/software/wget/), & [`yt-dlp`](https://github.com/yt-dlp/yt-dlp), and stores data in [ordinary files & folders](#archive-layout). +*(no complex proprietary formats, all data is readable without needing to run ArchiveBox)* + +The goal is to sleep soundly knowing the part of the internet you care about will be automatically preserved in durable, easily accessible formats [for decades](#background--motivation) after it goes down. + + +
+
-**â¤ĩī¸ See the [Quickstart](#Quickstart) below for more...** -
+**đŸ“Ļ  Install ArchiveBox using your preferred method: `docker` / `pip` / `apt` / etc. ([see full Quickstart below](#quickstart)).** + + +
Expand for quick copy-pastable install commands...   â¤ĩī¸ +
+
# Option A: Get ArchiveBox with Docker Compose (recommended):
+mkdir -p ~/archivebox/data && cd ~/archivebox
+curl -fsSL 'https://docker-compose.archivebox.io' > docker-compose.yml   # edit options in this file as-needed
+docker compose run archivebox init --install
+# docker compose run archivebox add 'https://example.com'
+# docker compose run archivebox help
+# docker compose up
+
+
+# Option B: Or use it as a plain Docker container: +mkdir -p ~/archivebox/data && cd ~/archivebox/data +docker run -it -v $PWD:/data archivebox/archivebox init --install +# docker run -it -v $PWD:/data archivebox/archivebox add 'https://example.com' +# docker run -it -v $PWD:/data archivebox/archivebox help +# docker run -it -v $PWD:/data -p 8000:8000 archivebox/archivebox +
+
+# Option C: Or install it with your preferred pkg manager (see Quickstart below for apt, brew, and more) +pip install archivebox +mkdir -p ~/archivebox/data && cd ~/archivebox/data +archivebox init --install +# archivebox add 'https://example.com' +# archivebox help +# archivebox server 0.0.0.0:8000 +
+
+# Option D: Or use the optional auto setup script to install it +curl -fsSL 'https://get.archivebox.io' | bash +
+
+Open http://web.archivebox.localhost:8000 for the public UI and http://admin.archivebox.localhost:8000 for the admin UI ➡️
+Set LISTEN_HOST to change the base domain; web. and admin. subdomains are used automatically. +
+
+ + +


-cli init screenshot -cli init screenshot -server snapshot admin screenshot -server snapshot details page screenshot +bookshelf graphic   logo   bookshelf graphic +

+Demo | Screenshots | Usage +
+. . . . . . . . . . . . . . . . . . . . . . . . . . . . +

+cli init screenshot +cli init screenshot +server snapshot admin screenshot +server snapshot details page screenshot

## Key Features -- [**Free & open source**](https://github.com/ArchiveBox/ArchiveBox/blob/master/LICENSE), doesn't require signing up for anything, stores all data locally -- [**Powerful, intuitive command line interface**](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#CLI-Usage) with [modular optional dependencies](#dependencies) +- [**Free & open source**](https://github.com/ArchiveBox/ArchiveBox/blob/dev/LICENSE), own your own data & maintain your privacy by self-hosting +- [**Powerful CLI**](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#CLI-Usage) with [modular dependencies](#dependencies) and [support for Google Drive/NFS/SMB/S3/B2/etc.](https://github.com/ArchiveBox/ArchiveBox/wiki/Setting-Up-Storage) - [**Comprehensive documentation**](https://github.com/ArchiveBox/ArchiveBox/wiki), [active development](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap), and [rich community](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community) -- [**Extracts a wide variety of content out-of-the-box**](https://github.com/ArchiveBox/ArchiveBox/issues/51): [media (youtube-dl), articles (readability), code (git), etc.](#output-formats) +- [**Extracts a wide variety of content out-of-the-box**](https://github.com/ArchiveBox/ArchiveBox/issues/51): [media (yt-dlp), articles (readability), code (git), etc.](#output-formats) - [**Supports scheduled/realtime importing**](https://github.com/ArchiveBox/ArchiveBox/wiki/Scheduled-Archiving) from [many types of sources](#input-formats) -- [**Uses standard, durable, long-term formats**](#saves-lots-of-useful-stuff-for-each-imported-link) like HTML, JSON, PDF, PNG, and WARC -- [**Usable as a oneshot CLI**](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#CLI-Usage), [**self-hosted web UI**](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#UI-Usage), [Python API](https://docs.archivebox.io/en/latest/modules.html) (BETA), [REST API](https://github.com/ArchiveBox/ArchiveBox/issues/496) (ALPHA), or 
[desktop app](https://github.com/ArchiveBox/electron-archivebox) (ALPHA) -- [**Saves all pages to archive.org as well**](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#submit_archive_dot_org) by default for redundancy (can be [disabled](https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#stealth-mode) for local-only mode) -- Planned: support for archiving [content requiring a login/paywall/cookies](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#chrome_user_data_dir) (working, but ill-advised until some pending fixes are released) -- Planned: support for running [JS during archiving](https://github.com/ArchiveBox/ArchiveBox/issues/51) to adblock, [autoscroll](https://github.com/ArchiveBox/ArchiveBox/issues/80), [modal-hide](https://github.com/ArchiveBox/ArchiveBox/issues/175), [thread-expand](https://github.com/ArchiveBox/ArchiveBox/issues/345)... +- [**Uses standard, durable, long-term formats**](#output-formats) like HTML, JSON, PDF, PNG, MP4, TXT, and WARC +- [**Powerful CLI**](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#CLI-Usage), [**self-hosted web UI**](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#UI-Usage), [Python API](https://docs.archivebox.io/en/dev/apidocs/archivebox/archivebox.html) (BETA), [REST API](https://github.com/ArchiveBox/ArchiveBox/issues/496) (ALPHA), or [desktop app](https://github.com/ArchiveBox/electron-archivebox) +- [**Saves all pages to archive.org as well**](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_archive_dot_org) by default for redundancy (can be [disabled](https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#stealth-mode) for local-only mode) +- Advanced users: support for archiving [content requiring login/paywall/cookies](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#chrome_user_data_dir) (see wiki security caveats!) 
+- Planned: support for running [JS during archiving](https://github.com/ArchiveBox/ArchiveBox/issues/51) to adblock, [autoscroll](https://github.com/ArchiveBox/ArchiveBox/issues/80), [modal-hide](https://github.com/ArchiveBox/ArchiveBox/issues/175), [thread-expand](https://github.com/ArchiveBox/ArchiveBox/issues/345) -

+
    + +## 🤝 Professional Integration + +ArchiveBox is free for everyone to self-host, but we also provide support, security review, and custom integrations to help NGOs, governments, and other organizations [run ArchiveBox professionally](https://zulip.archivebox.io/#narrow/stream/167-enterprise/topic/welcome/near/1191102): + +- **Journalists:** + `crawling during research`, `preserving cited pages`, `fact-checking & review` +- **Lawyers:** + `collecting & preserving evidence`, `detecting changes`, `tagging & review` +- **Researchers:** + `analyzing social media trends`, `getting LLM training data`, `crawling pipelines` +- **Individuals:** + `saving bookmarks`, `preserving portfolio content`, `legacy / memoirs archival` +- **Governments:** + `snapshotting public service sites`, `recordkeeping compliance` + +> ***[Contact us](https://zulip.archivebox.io/#narrow/stream/167-enterprise/topic/welcome/near/1191102)** if your org wants help using ArchiveBox professionally.* +> We offer: setup & support, CAPTCHA/ratelimit unblocking, SSO, audit logging/chain-of-custody, and more +> *ArchiveBox is a 🏛️ 501(c)(3) [nonprofit FSP](https://hackclub.com/hcb/) and all our work supports open-source development.* + +
    
-
+

-grassgrass +grassgrass
    -# Quickstart + -**🖥  Supported OSs:** Linux/BSD, macOS, Windows (Docker/WSL)   **👾  CPUs:** amd64, x86, arm8, arm7 (raspi>=3) +# Quickstart +**🖥  [Supported OSs](https://github.com/ArchiveBox/ArchiveBox/wiki/Install#supported-systems):** Linux/BSD, macOS, Windows (Docker)   **👾  CPUs:** `amd64` (`x86_64`), `arm64`, `arm7` (raspi>=3)
    
    -#### ⬇️  Initial Setup +
    
    -*(click to expand your preferred **► `distribution`** below for full setup instructions)* +#### ✳️  Easy Setup
    
-Get ArchiveBox with docker-compose on macOS/Linux/Windows ✨ (highly recommended) +Docker docker-compose (macOS/Linux/Windows)   👈  recommended   (click to expand) +
+👍 Docker Compose is recommended for the easiest install/update UX + best security + all extras out-of-the-box. +

+
    +
  1. Install Docker on your system (if not already installed).
  2. +
  3. Download the docker-compose.yml file into a new empty directory (can be anywhere). +
    mkdir -p ~/archivebox/data && cd ~/archivebox
    +# Read and edit docker-compose.yml options as-needed after downloading
    +curl -fsSL 'https://docker-compose.archivebox.io' > docker-compose.yml
    +
  4. +
      5. Run the initial setup to create an admin user (or set ADMIN_USERNAME/ADMIN_PASSWORD in docker-compose.yml) +
    
    docker compose run archivebox init --install
    +
  6. +
      7. Next steps: Start the server then log in to the Web UI http://127.0.0.1:8000 ⇢ Admin. +
    
    docker compose up
    +# completely optional, CLI can always be used without running a server
    +# docker compose run [-T] archivebox [subcommand] [--help]
    +docker compose run archivebox add 'https://example.com'
    +docker compose run archivebox help
    +
        +For more info, see Install: Docker Compose in the Wiki. ➡️ +
    
  8. +
-First make sure you have Docker installed: https://docs.docker.com/get-docker/ +See below for more usage examples using the CLI, Web UI, or filesystem/SQL/Python to manage your archive. +

+
-Download the [`docker-compose.yml`](https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/docker-compose.yml) file. -

-curl -O 'https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/docker-compose.yml'
+
+Docker docker run (macOS/Linux/Windows) +
+
    +
  1. Install Docker on your system (if not already installed).
  2. +
  3. Create a new empty directory and initialize your collection (can be anywhere). +
    mkdir -p ~/archivebox/data && cd ~/archivebox/data
    +docker run -v $PWD:/data -it archivebox/archivebox init --install
    +
    +
  4. +
      5. Optional: Start the server then log in to the Web UI http://127.0.0.1:8000 ⇢ Admin. +
    
    docker run -v $PWD:/data -p 8000:8000 archivebox/archivebox
    +# completely optional, CLI can always be used without running a server
        +# docker run -v $PWD:/data -it archivebox/archivebox [subcommand] [--help]
    
    +docker run -v $PWD:/data -it archivebox/archivebox help
     
        +For more info, see Install: Docker in the Wiki. ➡️ +
    
  6. +
-Start the server. -

-docker-compose run archivebox init --setup
-docker-compose up
+See below for more usage examples using the CLI, Web UI, or filesystem/SQL/Python to manage your archive.
+

+
+ +
+curl sh automatic setup script bash auto-setup script (macOS/Linux) +
+
    +
  1. Install Docker on your system (optional, highly recommended but not required).
  2. +
  3. Run the automatic setup script. +
    curl -fsSL 'https://get.archivebox.io' | bash
        +For more info, see Install: Bare Metal in the Wiki. ➡️ +
    
  4. +
+ +See below for more usage examples using the CLI, Web UI, or filesystem/SQL/Python to manage your archive.
+See setup.sh for the source code of the auto-install script.
+See "Against curl | sh as an install method" blog post for my thoughts on the shortcomings of this install method. +

+
+ +
+ +#### 🛠  Package Manager Setup + + + + +
+Pip pip (macOS/Linux/BSD) +
+
    + +
  1. Install Python >= v3.13 and Node >= v22 on your system (if not already installed).
  2. +
  3. Install the ArchiveBox package using pip3 (or uvx). +
    pip3 install --upgrade archivebox
    +archivebox version
    +# install any missing extras shown using apt/brew/pkg/etc. see Wiki for instructions
    +#    python@3.13 node curl wget git ripgrep ...
    +
    +See the Install: Bare Metal Wiki for full install instructions for each OS... +
  4. +
  5. Create a new empty directory and initialize your collection (can be anywhere). +
    mkdir -p ~/archivebox/data && cd ~/archivebox/data   # for example
        +archivebox init --install   # initialize a new collection
        +# (--install auto-installs and links JS dependencies: singlefile, readability, mercury, etc.)
    
     
    +
  6. +
      7. Optional: Start the server then log in to the Web UI http://127.0.0.1:8000 ⇢ Admin. +
    
    archivebox server 0.0.0.0:8000
    +# completely optional, CLI can always be used without running a server
    +# archivebox [subcommand] [--help]
    +archivebox help
    +
    +
  8. +
+ +See below for more usage examples using the CLI, Web UI, or filesystem/SQL/Python to manage your archive.
+
+See the pip-archivebox repo for more details about this distribution. +

+
-Open [`http://127.0.0.1:8000`](http://127.0.0.1:8000). -

-# you can also add links and manage your archive via the CLI:
-docker-compose run archivebox add 'https://example.com'
-echo 'https://example.com' | docker-compose run archivebox -T add
-docker-compose run archivebox status
-docker-compose run archivebox help  # to see more options
-
-# when passing stdin/stdout via the cli, use the -T flag
-echo 'https://example.com' | docker-compose run -T archivebox add
-docker-compose run -T archivebox list --html --with-headers > index.html
+
+aptitude apt (Ubuntu/Debian/etc.) +
+
    +
  1. Download and install the .deb package from the latest release. +
    # download the .deb for your architecture (amd64 or arm64)
    +ARCH="$(dpkg --print-architecture)"
    +VERSION="$(curl -fsSL https://api.github.com/repos/ArchiveBox/ArchiveBox/releases/latest | python3 -c "import sys,json; print(json.load(sys.stdin)['tag_name'].lstrip('v'))")"
    +curl -fsSL "https://github.com/ArchiveBox/ArchiveBox/releases/latest/download/archivebox_${VERSION}_${ARCH}.deb" -o /tmp/archivebox.deb
    +sudo apt install /tmp/archivebox.deb
    +archivebox version                         # make sure all dependencies are installed
    +
    +
  2. +
  3. Create a new empty directory and initialize your collection (can be anywhere). +
    mkdir -p ~/archivebox/data && cd ~/archivebox/data
    +archivebox init --install
     
    +
    +
  4. +
      5. Optional: Start the server then log in to the Web UI http://127.0.0.1:8000 ⇢ Admin. +
    
    archivebox server 0.0.0.0:8000
    +# completely optional, CLI can always be used without running a server
    +# archivebox [subcommand] [--help]
    +archivebox help
    +
    +
  6. +
+See below for more usage examples using the CLI, Web UI, or filesystem/SQL/Python to manage your archive.
+See the debian-archivebox repo for more details about this distribution. +

+
-This is the recommended way to run ArchiveBox because it includes all the extractors like:
-chrome, wget, youtube-dl, git, etc., full-text search w/ sonic, and many other great features. +
+homebrew brew (macOS only) +
+
    +
  1. Install Homebrew on your system (if not already installed).
  2. +
  3. Install the ArchiveBox package using brew. +
    brew tap archivebox/archivebox
    +brew install archivebox
    +archivebox version                         # make sure all dependencies are installed
    +
        +See the Install: Bare Metal Wiki for more granular instructions for macOS... ➡️ +
    
  4. +
  5. Create a new empty directory and initialize your collection (can be anywhere). +
    mkdir -p ~/archivebox/data && cd ~/archivebox/data
    +archivebox init --install
    +
    +
  6. +
      7. Optional: Start the server then log in to the Web UI http://127.0.0.1:8000 ⇢ Admin. +
    
    archivebox server 0.0.0.0:8000
    +# completely optional, CLI can always be used without running a server
    +# archivebox [subcommand] [--help]
    +archivebox help
    +

    +
  8. +
+See below for more usage examples using the CLI, Web UI, or filesystem/SQL/Python to manage your archive.
+See the homebrew-archivebox repo for more details about this distribution. +

-Get ArchiveBox with docker on macOS/Linux/Windows +Arch pacman / FreeBSD pkg / Nix nix (Arch/FreeBSD/NixOS/more) +
-First make sure you have Docker installed: https://docs.docker.com/get-docker/ +> *Warning: These are contributed by external volunteers and may lag behind the official `pip` channel.* -

-# create a new empty directory and initalize your collection (can be anywhere)
-mkdir ~/archivebox && cd ~/archivebox
-docker run -v $PWD:/data -it archivebox/archivebox init --setup
-
-# start the webserver and open the UI (optional)
-docker run -v $PWD:/data -p 8000:8000 archivebox/archivebox server 0.0.0.0:8000
-open http://127.0.0.1:8000
-
-# you can also add links and manage your archive via the CLI:
-docker run -v $PWD:/data -it archivebox/archivebox add 'https://example.com'
-docker run -v $PWD:/data -it archivebox/archivebox status
-docker run -v $PWD:/data -it archivebox/archivebox help  # to see more options
-
-# when passing stdin/stdout via the cli, use only -i (not -it)
-echo 'https://example.com' | docker run -v $PWD:/data -i archivebox/archivebox add
-docker run -v $PWD:/data -i archivebox/archivebox list --html --with-headers > index.html
-
+ +See below for usage examples using the CLI, Web UI, or filesystem/SQL/Python to manage your archive. +

+
+ +
+ +#### 🎗  Other Options +
+Docker docker + Electron electron Desktop App (macOS/Linux/Windows) +
+
    +
  1. Install Docker on your system (if not already installed).
  2. +
  3. Download a binary release for your OS or build the native app from source
    + +
  4. +
+ +
    +✨ Alpha (contributors wanted!): for more info, see the Electron ArchiveBox repo. +
    
-Get ArchiveBox with apt on Ubuntu/Debian +Self-hosting Platforms TrueNAS / UNRAID / YunoHost / Cloudron / etc. (self-hosting solutions) +
-This method should work on all Ubuntu/Debian based systems, including x86, amd64, arm7, and arm8 CPUs (e.g. Raspberry Pis >=3). +> *Warning: These are contributed by external volunteers and may lag behind the official `pip` channel.* + + +See below for usage examples using the CLI, Web UI, or filesystem/SQL/Python to manage your archive. +

+
-If you're on Ubuntu >= 20.04, add the `apt` repository with `add-apt-repository`: -(on other Ubuntu/Debian-based systems follow the ♰ instructions below) +
+paid Paid hosting solutions (cloud VPS) +
+ -

-# add the repo to your sources and install the archivebox package using apt
-sudo apt install software-properties-common
-sudo add-apt-repository -u ppa:archivebox/archivebox
-sudo apt install archivebox
-
+For more discussion on managed and paid hosting options see here: Issue #531. -

-# create a new empty directory and initalize your collection (can be anywhere)
-mkdir ~/archivebox && cd ~/archivebox
-archivebox init --setup
+
-# start the webserver and open the web UI (optional) -archivebox server 0.0.0.0:8000 -open http://127.0.0.1:8000 - -# you can also add URLs and manage the archive via the CLI and filesystem: -archivebox add 'https://example.com' -archivebox status -archivebox list --html --with-headers > index.html -archivebox list --json --with-headers > index.json -archivebox help # to see more options -
+
    -♰ On other Ubuntu/Debian-based systems add these sources directly to /etc/apt/sources.list: +#### ➡️  Next Steps -
    

-echo "deb http://ppa.launchpad.net/archivebox/archivebox/ubuntu focal main" > /etc/apt/sources.list.d/archivebox.list
-echo "deb-src http://ppa.launchpad.net/archivebox/archivebox/ubuntu focal main" >> /etc/apt/sources.list.d/archivebox.list
-sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys C258F79DCC02E369
-sudo apt update
-sudo apt install archivebox
-archivebox setup
-archivebox --version
-# then scroll back up and continue the initalization instructions above
-
+- Import URLs from some of the supported [Input Formats](#input-formats) or view the supported [Output Formats](#output-formats)... +- (Optional) Create a persona and import browser cookies to archive logged-in sites: `archivebox persona create --import=chrome personal` +- Tweak your UI or archiving behavior [Configuration](#configuration), read about some of the [Caveats](#caveats), or [Troubleshoot](https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting) +- Read about the [Dependencies](#dependencies) used for archiving, the [Upgrading Process](https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives), or the [Archive Layout](#archive-layout) on disk... +- Or check out our full [Documentation](#documentation) or [Community Wiki](#internet-archiving-ecosystem)... -(you may need to install some other dependencies manually however) +
- +### Usage -
    -Get ArchiveBox with brew on macOS +#### ⚡️  CLI Usage -First make sure you have Homebrew installed: https://brew.sh/#install +ArchiveBox commands can be run in a terminal [directly on your host](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#cli-usage), or via [Docker](https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#usage-1)/[Docker Compose](https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#usage). +(depending on how you chose to install it above) -
    

-# install the archivebox package using homebrew
-brew install archivebox/archivebox/archivebox
+```bash
+mkdir -p ~/archivebox/data   # create a new data dir anywhere
+cd ~/archivebox/data         # IMPORTANT: cd into the directory
 
-# create a new empty directory and initalize your collection (can be anywhere)
-mkdir ~/archivebox && cd ~/archivebox
-archivebox init --setup
+# archivebox [subcommand] [--help]
+archivebox version
+archivebox help
 
-# start the webserver and open the web UI (optional)
-archivebox server 0.0.0.0:8000
-open http://127.0.0.1:8000
-
-# you can also add URLs and manage the archive via the CLI and filesystem:
-archivebox add 'https://example.com'
-archivebox status
-archivebox list --html --with-headers > index.html
-archivebox list --json --with-headers > index.json
-archivebox help  # to see more options
-
+# equivalent: docker compose run archivebox [subcommand] [--help] +docker compose run archivebox help -
+# equivalent: docker run -it -v $PWD:/data archivebox/archivebox [subcommand] [--help] +docker run -it -v $PWD:/data archivebox/archivebox help + +# optional: import your browser cookies into a persona for logged-in archiving +archivebox persona create --import=chrome personal +# supported: chrome/chromium/brave/edge (Chromium-based only) +# use --profile to target a specific profile (e.g. Default, Profile 1) +# re-running import merges/dedupes cookies.txt (by domain/path/name) but replaces chrome_user_data +``` + +#### ArchiveBox Subcommands +- `archivebox` `help`/`version` to see the list of available subcommands / currently installed version info +- `archivebox` `setup`/`init`/`config`/`status`/`shell`/`manage` to administer your collection +- `archivebox` `add`/`schedule` to pull in fresh URLs from [bookmarks/history/RSS/etc.](#input-formats) +- `archivebox` `list`/`update`/`remove` to manage existing Snapshots in your collection + +
-Get ArchiveBox with pip on any other platforms (some extras must be installed manually) +curl sh automatic setup script CLI Usage Examples: non-Docker +
+

+# make sure you have pip-installed ArchiveBox and it's available in your $PATH first  
+
+# archivebox [subcommand] [--help] +archivebox init --install # safe to run init multiple times (also how you update versions) +archivebox version # get archivebox version info + check dependencies +archivebox help # get list of archivebox subcommands that can be run +archivebox add --depth=1 'https://news.ycombinator.com' +
    +For more info, see our Usage: CLI Usage wiki. ➡️ +
    
-First make sure you have [Python >= v3.7](https://realpython.com/installing-python/) and [Node >= v12](https://nodejs.org/en/download/package-manager/) installed. +
+
+Docker CLI Usage Examples: Docker Compose +

-# install the archivebox package using pip3
-pip3 install archivebox
+# make sure you have `docker-compose.yml` from the Quickstart instructions first
+
+# docker compose run archivebox [subcommand] [--help] +docker compose run archivebox init --install +docker compose run archivebox version +docker compose run archivebox help +docker compose run archivebox add --depth=1 'https://news.ycombinator.com' +# to start webserver: docker compose up +
    +For more info, see our Usage: Docker Compose CLI wiki. ➡️ +
    
-# create a new empty directory and initalize your collection (can be anywhere) -mkdir ~/archivebox && cd ~/archivebox -archivebox init --setup -# Install any missing extras like wget/git/ripgrep/etc. manually as needed +
-# start the webserver and open the web UI (optional) -archivebox server 0.0.0.0:8000 -open http://127.0.0.1:8000 - -# you can also add URLs and manage the archive via the CLI and filesystem: -archivebox add 'https://example.com' -archivebox status -archivebox list --html --with-headers > index.html -archivebox list --json --with-headers > index.json -archivebox help # to see more options +
+Docker CLI Usage Examples: Docker +
+

    +# make sure you create and cd into a new empty directory first  
    
+
+# docker run -it -v $PWD:/data archivebox/archivebox [subcommand] [--help] +docker run -v $PWD:/data -it archivebox/archivebox init --install +docker run -v $PWD:/data -it archivebox/archivebox version +docker run -v $PWD:/data -it archivebox/archivebox help +docker run -v $PWD:/data -it archivebox/archivebox add --depth=1 'https://news.ycombinator.com' +# to start webserver: docker run -v $PWD:/data -it -p 8000:8000 archivebox/archivebox
    +For more info, see our Usage: Docker CLI wiki. ➡️ +
    
+ +
+
+🗄  SQL/Python/Filesystem Usage +

+archivebox shell           # explore the Python library API in a REPL
+sqlite3 ./index.sqlite3    # run SQL queries directly on your index
+ls ./archive/*/index.html  # or inspect snapshot data directly on the filesystem
+
    +For more info, see our Python Shell, SQL API, and Disk Layout wikis. ➡️
    
    -#### ⚡️  CLI Usage +
    
-```bash -# archivebox [subcommand] [--args] -# docker-compose run archivebox [subcommand] [--args] -# docker run -v $PWD:/data -it [subcommand] [--args] +
    +🖥  Web UI & API Usage +
    

+# Start the server on bare metal (pip/apt/brew/etc):
+archivebox manage createsuperuser              # create a new admin user via CLI
+archivebox server 0.0.0.0:8000                 # start the server
+
+# Or with Docker Compose: +nano docker-compose.yml # setup initial ADMIN_USERNAME & ADMIN_PASSWORD +docker compose up # start the server +
+# Or with a Docker container: +docker run -v $PWD:/data -it archivebox/archivebox archivebox manage createsuperuser +docker run -v $PWD:/data -it -p 8000:8000 archivebox/archivebox +
    -archivebox init --setup # safe to run init multiple times (also how you update versions) -archivebox --version -archivebox help -``` +Open http://web.archivebox.localhost:8000 for the public UI and http://admin.archivebox.localhost:8000 for the admin UI ➡️
    
+Set LISTEN_HOST to change the base domain; web. and admin. subdomains are used automatically. +

    +For more info, see our Usage: Web UI wiki. ➡️ +
    

+Optional: Change permissions to allow non-logged-in users -- `archivebox setup/init/config/status/manage` to administer your collection -- `archivebox add/schedule/remove/update/list/shell/oneshot` to manage Snapshots in the archive -- `archivebox schedule` to pull in fresh URLs in regularly from [boorkmarks/history/Pocket/Pinboard/RSS/etc.](#input-formats) +

+archivebox config --set PUBLIC_ADD_VIEW=True   # allow guests to submit URLs 
+archivebox config --set PUBLIC_SNAPSHOTS=True  # allow guests to see snapshot content
+archivebox config --set PUBLIC_INDEX=True      # allow guests to see list of all snapshots
+# or
+docker compose run archivebox config --set ...
 
    -#### 🖥  Web UI Usage
    
+# restart the server to apply any config changes
+
+
-```bash -archivebox manage createsuperuser -archivebox server 0.0.0.0:8000 -``` -Then open http://127.0.0.1:8000 to view the UI. +
+
-```bash -# you can also configure whether or not login is required for most features -archivebox config --set PUBLIC_INDEX=False -archivebox config --set PUBLIC_SNAPSHOTS=False -archivebox config --set PUBLIC_ADD_VIEW=False -``` +> [!TIP] +> Whether in Docker or not, ArchiveBox commands work the same way, and can be used to access the same data on-disk. +> For example, you could run the Web UI in Docker Compose, and run one-off commands with `pip`-installed ArchiveBox. -#### 🗄  SQL/Python/Filesystem Usage +
+Expand to show comparison...
+ +

+archivebox add --depth=1 'https://example.com'                     # add a URL with pip-installed archivebox on the host
+docker compose run archivebox add --depth=1 'https://example.com'                       # or w/ Docker Compose
+docker run -it -v $PWD:/data archivebox/archivebox add --depth=1 'https://example.com'  # or w/ Docker, all equivalent
+
    + +For more info, see our Docker wiki. ➡️ + +
    
-```bash -sqlite3 ./index.sqlite3 # run SQL queries on your index -archivebox shell # explore the Python API in a REPL -ls ./archive/*/index.html # or inspect snapshots on the filesystem -```
-
-grassgrass +
+grassgrass

-
+
. . . . . . . . . . . . . . . . . . . . . . . . . . . .

DEMO: https://demo.archivebox.io
@@ -337,54 +642,227 @@ ls ./archive/*/index.html # or inspect snapshots on the filesystem --- -
-lego +
+lego

    # Overview -## Input formats + + +## Input Formats: How to pass URLs into ArchiveBox for saving + -ArchiveBox supports many input formats for URLs, including Pocket & Pinboard exports, Browser bookmarks, Browser history, plain text, HTML, markdown, and more! +- From the official ArchiveBox Browser Extension + Provides realtime archiving of browsing history or selected pages from Chrome/Chromium/Firefox browsers. +- From manual imports of URLs from RSS, JSON, CSV, TXT, SQL, HTML, Markdown, etc. files + ArchiveBox supports ingesting URLs in [any text-based format](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Import-a-list-of-URLs-from-a-text-file). -*Click these links for instructions on how to propare your links from these sources:* +- From manually exported [browser history](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) or [browser bookmarks](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) (in Netscape format) + Instructions: Chrome, Firefox, Safari, IE, Opera, and more... + +- From URLs visited through a [MITM Proxy](https://mitmproxy.org/) with [`archivebox-proxy`](https://github.com/ArchiveBox/archivebox-proxy) + Provides [realtime archiving](https://github.com/ArchiveBox/ArchiveBox/issues/577) of all traffic from any device going through the proxy. + +- From bookmarking services or social media (e.g. Twitter bookmarks, Reddit saved posts, etc.) + Instructions: Pocket, Pinboard, Instapaper, Shaarli, Delicious, Reddit Saved, Wallabag, Unmark.it, OneTab, Firefox Sync, and more... 
    
+ + + -- TXT, RSS, XML, JSON, CSV, SQL, HTML, Markdown, or [any other text-based format...](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Import-a-list-of-URLs-from-a-text-file) -- [Browser history](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) or [browser bookmarks](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) (see instructions for: [Chrome](https://support.google.com/chrome/answer/96816?hl=en), [Firefox](https://support.mozilla.org/en-US/kb/export-firefox-bookmarks-to-backup-or-transfer), [Safari](http://i.imgur.com/AtcvUZA.png), [IE](https://support.microsoft.com/en-us/help/211089/how-to-import-and-export-the-internet-explorer-favorites-folder-to-a-32-bit-version-of-windows), [Opera](http://help.opera.com/Windows/12.10/en/importexport.html), [and more...](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive)) -- [Pocket](https://getpocket.com/export), [Pinboard](https://pinboard.in/export/), [Instapaper](https://www.instapaper.com/user/export), [Shaarli](https://shaarli.readthedocs.io/en/master/Usage/#importexport), [Delicious](https://www.groovypost.com/howto/howto/export-delicious-bookmarks-xml/), [Reddit Saved](https://github.com/csu/export-saved-reddit), [Wallabag](https://doc.wallabag.org/en/user/import/wallabagv2.html), [Unmark.it](http://help.unmark.it/import-export), [OneTab](https://www.addictivetips.com/web/onetab-save-close-all-chrome-tabs-to-restore-export-or-import/), [and more...](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) ```bash # archivebox add --help archivebox add 'https://example.com/some/page' -archivebox add < ~/Downloads/firefox_bookmarks_export.html +archivebox add --parser=generic_rss < ~/Downloads/some_feed.xml archivebox add --depth=1 'https://news.ycombinator.com#2020-12-12' echo 'http://example.com' | archivebox add -echo 'any_text_with 
[urls](https://example.com) in it' | archivebox add +echo 'any text with urls in it' | archivebox add -# (if using docker add -i when piping stdin) -echo 'https://example.com' | docker run -v $PWD:/data -i archivebox/archivebox add - -# (if using docker-compose add -T when piping stdin / stdout) -echo 'https://example.com' | docker-compose run -T archivebox add +# if using Docker, add -i when piping stdin: +# echo 'https://example.com' | docker run -v $PWD:/data -i archivebox/archivebox add +# if using Docker Compose, add -T when piping stdin / stdout: +# echo 'https://example.com' | docker compose run -T archivebox add ``` See the [Usage: CLI](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#CLI-Usage) page for documentation and examples. -It also includes a built-in scheduled import feature with `archivebox schedule` and browser bookmarklet, so you can pull in URLs from RSS feeds, websites, or the filesystem regularly/on-demand. +It also includes a built-in scheduled import feature with `archivebox schedule`, handled by the same orchestrator that powers `archivebox server`, so you can pull in URLs from RSS feeds and websites regularly without a separate cron container. + +
+ + + + +## Output Formats: What ArchiveBox saves for each URL + + + + +For each web page added, ArchiveBox creates a Snapshot folder and preserves its content as ordinary files inside the folder (e.g. HTML, PDF, PNG, JSON, etc.). + +It uses all available methods out-of-the-box, but you can disable extractors and fine-tune the [configuration](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration) as-needed. + +
+
+Expand to see the full list of ways it saves each page... + + +data/archive/{Snapshot.id}/
+
    +
  • Index: index.html & index.json HTML and JSON index files containing metadata and details
  • +
  • Title, Favicon, Headers Response headers, site favicon, and parsed site title
  • +
  • SingleFile: singlefile.html HTML snapshot rendered with headless Chrome using SingleFile
  • +
  • Wget Clone: example.com/page-name.html wget clone of the site with warc/TIMESTAMP.gz
  • +
  • Chrome Headless
      +
    • PDF: output.pdf Printed PDF of site using headless chrome
    • +
    • Screenshot: screenshot.png 1440x900 screenshot of site using headless chrome
    • +
    • DOM Dump: output.html DOM Dump of the HTML after rendering using headless chrome
    • +
  • +
  • Article Text: article.html/json Article text extraction using Readability & Mercury
  • +
  • Archive.org Permalink: archive.org.txt A link to the saved site on archive.org
  • +
  • Audio & Video: media/ all audio/video files + playlists, including subtitles & metadata w/ yt-dlp
  • +
  • Source Code: git/ clone of any repository found on GitHub, Bitbucket, or GitLab links
  • +
  • More coming soon! See the Roadmap...
  • +
+
+
+ +## Configuration + + +ArchiveBox can be configured via environment variables, by using the `archivebox config` CLI, or by editing `./ArchiveBox.conf`.
+
+Expand to see examples... +
archivebox config                               # view the entire config
+archivebox config --get CHROME_BINARY           # view a specific value
+
+archivebox config --set CHROME_BINARY=chromium # persist a config using CLI +# OR +echo CHROME_BINARY=chromium >> ArchiveBox.conf # persist a config using file +# OR +env CHROME_BINARY=chromium archivebox ... # run with a one-off config +
+These methods also work the same way when run inside Docker, see the Docker Configuration wiki page for details. +

+ +The configuration is documented here: **[Configuration Wiki](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration)**, and loaded from: [`archivebox/config/`](https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/config/). + + +
+Expand to see the most common options to tweak... +

+# e.g. archivebox config --set TIMEOUT=120
+# or   docker compose run archivebox config --set TIMEOUT=120
+
+TIMEOUT=240 # default: 60 add more seconds on slower networks +CHECK_SSL_VALIDITY=False # default: True False = allow saving URLs w/ bad SSL +
+PUBLIC_INDEX=True # default: True whether anon users can view index +PUBLIC_SNAPSHOTS=True # default: True whether anon users can view pages +PUBLIC_ADD_VIEW=False # default: False whether anon users can add new URLs +
+USER_AGENT="Mozilla/5.0 ..." # change this to get around bot blocking +
+
+
+ +## Dependencies + +To achieve high-fidelity archives in as many situations as possible, ArchiveBox depends on a variety of 3rd-party libraries and tools that specialize in extracting different types of content. + +> Under-the-hood, ArchiveBox uses [Django](https://www.djangoproject.com/start/overview/) to power its [Web UI](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#ui-usage), [Django Ninja](https://django-ninja.dev/) for the REST API, and [SQLite](https://www.sqlite.org/locrsf.html) + the filesystem to provide [fast & durable metadata storage](https://www.sqlite.org/locrsf.html) w/ [deterministic upgrades](https://stackoverflow.com/a/39976321/2156113). + +ArchiveBox bundles industry-standard tools like [Google Chrome](https://github.com/ArchiveBox/ArchiveBox/wiki/Chromium-Install), [`wget`, `yt-dlp`, `readability`, etc.](#dependencies) internally, and its operation can be [tuned, secured, and extended](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration) as-needed for many different applications. + +
+
+Expand to learn more about ArchiveBox's internals & dependencies...
+ +
+

TIP: For better security while running ArchiveBox, and to avoid polluting your host system with a bunch of sub-dependencies that you need to keep up-to-date, it is strongly recommended to use the ⭐️ official Docker image which provides everything in an easy container with simple one-liner upgrades.

+
+ +
    +
  • Language: Python >=3.13
  • +
  • Backend: Django + Django-Ninja for REST API
  • +
  • Frontend: Django Admin + Vanilla HTML, CSS, JS
  • +
  • Web Server: Django + daphne (ASGI)
  • +
  • Database: Django ORM saving to SQLite3 ./data/index.sqlite3
  • +
  • Job Queue: Custom orchestrator using supervisord for worker management
  • +
  • Build/test/lint: uv / pyright+ty+pytest / ruff
  • +
  • Subdependencies: abx-pkg installs apt/brew/pip/npm pkgs at runtime (e.g. yt-dlp, singlefile, readability, git)
  • +
+ + +These optional subdependencies used for archiving sites include: + +archivebox --version CLI output screenshot showing dependencies installed + +
    +
  • chromium / chrome (for screenshots, PDF, DOM HTML, and headless JS scripts)
  • +
  • node & npm (for readability, mercury, and singlefile)
  • +
  • wget (for plain HTML, static files, and WARC saving)
  • +
  • curl (for fetching headers, favicon, and posting to Archive.org)
  • +
  • yt-dlp or youtube-dl (for audio, video, and subtitles)
  • +
  • git (for cloning git repos)
  • +
  • singlefile (for saving into a self-contained html file)
  • +
  • postlight/parser (for discussion threads, forums, and articles)
  • +
  • readability (for articles and long text content)
  • +
  • and more as we grow...
  • +
+ +You don't need to install every dependency to use ArchiveBox. ArchiveBox will automatically disable extractors that rely on dependencies that aren't installed, based on what is configured and available in your $PATH. + +If not using Docker, make sure to keep the dependencies up-to-date yourself and check that ArchiveBox isn't reporting any incompatibility with the versions you install. + +
# install python3 and archivebox with your system package manager
+# apt/brew/pip/etc install ... (see Quickstart instructions above)
+
+which -a archivebox # see where you have installed archivebox +archivebox install # auto install all the extractors and extras +archivebox --version # see info and check validity of installed dependencies +
+ +Installing directly on Windows without Docker or WSL/WSL2/Cygwin is not officially supported (I cannot respond to Windows support tickets), but some advanced users have reported getting it working. + +

Learn More

+ + +
+
+ ## Archive Layout -All of ArchiveBox's state (including the index, snapshot data, and config file) is stored in a single folder called the "ArchiveBox data folder". All `archivebox` CLI commands must be run from inside this folder, and you first create it by running `archivebox init`. +All of ArchiveBox's state (SQLite DB, content, config, logs, etc.) is stored in a single folder per collection. -The on-disk layout is optimized to be easy to browse by hand and durable long-term. The main index is a standard `index.sqlite3` database in the root of the data folder (it can also be exported as static JSON/HTML), and the archive snapshots are organized by date-added timestamp in the `./archive/` subfolder. +
+
+Expand to learn more about the layout of ArchiveBox's data on-disk...
-```bash -./ +Data folders can be created anywhere (`~/archivebox/data` or `$PWD/data` as seen in our examples), and you can create as many data folders as you want to hold different collections. +All archivebox CLI commands are designed to be run from inside an ArchiveBox data folder, starting with archivebox init to initialize a new collection inside an empty directory. + +
mkdir -p ~/archivebox/data && cd ~/archivebox/data   # just an example, can be anywhere
+archivebox init
+ +The on-disk layout is optimized to be easy to browse by hand and durable long-term. The main index is a standard index.sqlite3 database in the root of the data folder (it can also be exported as static JSON/HTML), and the archive snapshots are organized by date-added timestamp in the data/archive/ subfolder. + + + + +
data/
     index.sqlite3
     ArchiveBox.conf
     archive/
@@ -397,194 +875,272 @@ The on-disk layout is optimized to be easy to browse by hand and durable long-te
             warc/1617687755.warc.gz
             git/somerepo.git
             ...
-```
-
-Each snapshot subfolder `./archive//` includes a static `index.json` and `index.html` describing its contents, and the snapshot extrator outputs are plain files within the folder.
-
-
- -## Output formats - -Inside each Snapshot folder, ArchiveBox save these different types of extractor outputs as plain files: - -`./archive//*` - -- **Index:** `index.html` & `index.json` HTML and JSON index files containing metadata and details -- **Title**, **Favicon**, **Headers** Response headers, site favicon, and parsed site title -- **SingleFile:** `singlefile.html` HTML snapshot rendered with headless Chrome using SingleFile -- **Wget Clone:** `example.com/page-name.html` wget clone of the site with `warc/.gz` -- Chrome Headless - - **PDF:** `output.pdf` Printed PDF of site using headless chrome - - **Screenshot:** `screenshot.png` 1440x900 screenshot of site using headless chrome - - **DOM Dump:** `output.html` DOM Dump of the HTML after rendering using headless chrome -- **Article Text:** `article.html/json` Article text extraction using Readability & Mercury -- **Archive.org Permalink:** `archive.org.txt` A link to the saved site on archive.org -- **Audio & Video:** `media/` all audio/video files + playlists, including subtitles & metadata with youtube-dl -- **Source Code:** `git/` clone of any repository found on github, bitbucket, or gitlab links -- _More coming soon! See the [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap)..._ +
-It does everything out-of-the-box by default, but you can disable or tweak [individual archive methods](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration) via environment variables / config. +Each snapshot subfolder data/archive/TIMESTAMP/ includes a static index.json and index.html describing its contents, and the snapshot extractor outputs are plain files within the folder. -```bash -# archivebox config --help -archivebox config # see all currently configured options -archivebox config --set SAVE_ARCHIVE_DOT_ORG=False -archivebox config --set YOUTUBEDL_ARGS='--max-filesize=500m' -``` +

Learn More

+ +

+ ## Static Archive Exporting -You can export the main index to browse it statically without needing to run a server. +You can export your index as static HTML using `archivebox list` (so you can view it without an ArchiveBox server). -*Note about large exports: These exports are not paginated, exporting many URLs or the entire archive at once may be slow. Use the filtering CLI flags on the `archivebox list` command to export specific Snapshots or ranges.* +
+
+Expand to learn how to export your ArchiveBox collection...
-```bash| -# archivebox list --help +
+

NOTE: These exports are not paginated, exporting many URLs or the entire archive at once may be slow. Use the filtering CLI flags on the archivebox list command to export specific Snapshots or ranges.

+
+
# archivebox list --help
 archivebox list --html --with-headers > index.html     # export to static html table
 archivebox list --json --with-headers > index.json     # export to json blob
 archivebox list --csv=timestamp,url,title > index.csv  # export to csv spreadsheet
 
-# (if using docker-compose, add the -T flag when piping)
-docker-compose run -T archivebox list --html --filter-type=search snozzberries > index.json
-```
+# (if using Docker Compose, add the -T flag when piping)
+# docker compose run -T archivebox list --html 'https://example.com' > index.html
+
The paths in the static exports are relative, make sure to keep them next to your `./archive` folder when backing them up or viewing them. -
+

Learn More

-## Dependencies - -For better security, easier updating, and to avoid polluting your host system with extra dependencies, **it is strongly recommended to use the official [Docker image](https://github.com/ArchiveBox/ArchiveBox/wiki/Docker)** with everything preinstalled for the best experience. + -To achieve high fidelity archives in as many situations as possible, ArchiveBox depends on a variety of 3rd-party tools and libraries that specialize in extracting different types of content. These optional dependencies used for archiving sites include: +
+
-- `chromium` / `chrome` (for screenshots, PDF, DOM HTML, and headless JS scripts) -- `node` & `npm` (for readability, mercury, and singlefile) -- `wget` (for plain HTML, static files, and WARC saving) -- `curl` (for fetching headers, favicon, and posting to Archive.org) -- `youtube-dl` (for audio, video, and subtitles) -- `git` (for cloning git repos) -- and more as we grow... -You don't need to install every dependency to use ArchiveBox. ArchiveBox will automatically disable extractors that rely on dependencies that aren't installed, based on what is configured and available in your `$PATH`. +
+security graphic +
-*If using Docker, you don't have to install any of these manually, all dependencies are set up properly out-of-the-box*. -However, if you prefer not using Docker, you *can* install ArchiveBox and its dependencies using your [system package manager](https://github.com/ArchiveBox/ArchiveBox/wiki/Install) or `pip` directly on any Linux/macOS system. Just make sure to keep the dependencies up-to-date and check that ArchiveBox isn't reporting any incompatibility with the versions you install. +## Caveats -```bash -# install python3 and archivebox with your system package manager -# apt/brew/pip/etc install ... (see Quickstart instructions above) +### Archiving Private Content -archivebox setup # auto install all the extractors and extras -archivebox --version # see info and check validity of installed dependencies -``` + -Installing directly on **Windows without Docker or WSL/WSL2/Cygwin is not officially supported**, but some advanced users have reported getting it working. +If you're importing pages with private content or URLs containing secret tokens you don't want public (e.g Google Docs, paywalled content, unlisted videos, etc.), **you may want to disable some of the extractor methods to avoid leaking that content to 3rd party APIs or the public**.
+
+Expand to learn about privacy, permissions, and user accounts... ---- -
-security graphic -
+
# don't save private content to ArchiveBox, e.g.:
+archivebox add 'https://docs.google.com/document/d/12345somePrivateDocument'
+archivebox add 'https://vimeo.com/somePrivateVideo'
 
-## Caveats
+# restrict the main index, Snapshot content, and Add Page to authenticated users as-needed:
+archivebox config --set PUBLIC_INDEX=False
+archivebox config --set PUBLIC_SNAPSHOTS=False
+archivebox config --set PUBLIC_ADD_VIEW=False
+archivebox manage createsuperuser
+
-### Archiving Private URLs +
+

CAUTION: Assume anyone viewing your archives will be able to see any cookies, session tokens, or private URLs passed to ArchiveBox during archiving. +Make sure to secure your ArchiveBox data and don't share snapshots with others without stripping out sensitive headers and content first.

+
-If you're importing URLs containing secret slugs or pages with private content (e.g Google Docs, unlisted videos, etc), **you may want to disable some of the extractor modules to avoid leaking private URLs to 3rd party APIs** during the archiving process. +

Learn More

-```bash -# don't do this: -archivebox add 'https://docs.google.com/document/d/12345somelongsecrethere' -archivebox add 'https://example.com/any/url/you/want/to/keep/secret/' + -# without first disabling share the URL with 3rd party APIs: -archivebox config --set SAVE_ARCHIVE_DOT_ORG=False # disable saving all URLs in Archive.org +
+
-# if extra paranoid or anti-google: -archivebox config --set SAVE_FAVICON=False # disable favicon fetching (it calls a google API) -archivebox config --set CHROME_BINARY=chromium # ensure it's using Chromium instead of Chrome -``` ### Security Risks of Viewing Archived JS -Be aware that malicious archived JS can access the contents of other pages in your archive when viewed. Because the Web UI serves all viewed snapshots from a single domain, they share a request context and **typical CSRF/CORS/XSS/CSP protections do not work to prevent cross-site request attacks**. See the [Security Overview](https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#stealth-mode) page for more details. +Be aware that malicious archived JS can access the contents of other pages in your archive when viewed. Because the Web UI serves all viewed snapshots from a single domain, they share a request context and **typical CSRF/CORS/XSS/CSP protections do not work to prevent cross-site request attacks**. See the [Security Overview](https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#stealth-mode) page and [Issue #239](https://github.com/ArchiveBox/ArchiveBox/issues/239) for more details. -```bash -# visiting an archived page with malicious JS: + +
+
+Expand to see risks and mitigations... + + +
# visiting an archived page with malicious JS:
 https://127.0.0.1:8000/archive/1602401954/example.com/index.html
 
 # example.com/index.js can now make a request to read everything from:
 https://127.0.0.1:8000/index.html
 https://127.0.0.1:8000/archive/*
 # then example.com/index.js can send it off to some evil server
-```
+
+ +
+

NOTE: Only the wget & dom extractor methods execute archived JS when viewing snapshots, all other archive methods produce static output that does not execute JS on viewing.
+If you are worried about these issues ^ you should disable these extractors using:
archivebox config --set SAVE_WGET=False SAVE_DOM=False.

+
+ +

Learn More

+ + +
+
+ + +### Working Around Sites that Block Archiving + +For various reasons, many large sites (Reddit, Twitter, Cloudflare, etc.) actively block archiving or bots in general. There are a number of approaches to work around this, and we also provide consulting services to help here. + +
+
+Click to learn how to set up user agents, cookies, and site logins... +
+ + + + +In the future we plan on adding support for running JS scripts during archiving to block ads, cookie popups, modals, and fix other issues. Follow here for progress: Issue #51. + +
+
+ ### Saving Multiple Snapshots of a Single URL -Support for saving multiple snapshots of each site over time will be [added eventually](https://github.com/ArchiveBox/ArchiveBox/issues/179) (along with the ability to view diffs of the changes between runs). For now **ArchiveBox is designed to only archive each URL with each extractor type once**. A workaround to take multiple snapshots of the same URL is to make them slightly different by adding a hash: +ArchiveBox appends a hash with the current date `https://example.com#2020-10-24` to differentiate when a single URL is archived multiple times. -```bash -archivebox add 'https://example.com#2020-10-24' + +
+
+Click to learn how the Re-Snapshot feature works... +
+ + +Because ArchiveBox uniquely identifies snapshots by URL, it must use a workaround to take multiple snapshots of the same URL (otherwise they would show up as a single Snapshot entry). It makes the URLs of repeated snapshots unique by adding a hash with the archive date at the end: + +
archivebox add 'https://example.com#2020-10-24'
 ...
 archivebox add 'https://example.com#2020-10-25'
-```
+
+ +The Re-Snapshot Button button in the Admin UI is a shortcut for this hash-date multi-snapshotting workaround. + +Improved support for saving multiple snapshots of a single URL without this hash-date workaround will be added eventually (along with the ability to view diffs of the changes between runs). + +

Learn More

+ + + +
+
### Storage Requirements -Because ArchiveBox is designed to ingest a firehose of browser history and bookmark feeds to a local disk, it can be much more disk-space intensive than a centralized service like the Internet Archive or Archive.today. However, as storage space gets cheaper and compression improves, you should be able to use it continuously over the years without having to delete anything. +Because ArchiveBox is designed to ingest a large volume of URLs with multiple copies of each URL stored by different 3rd-party tools, it can be quite disk-space intensive. There are also some special requirements when using filesystems like NFS/SMB/FUSE. -**ArchiveBox can use anywhere from ~1gb per 1000 articles, to ~50gb per 1000 articles**, mostly dependent on whether you're saving audio & video using `SAVE_MEDIA=True` and whether you lower `MEDIA_MAX_SIZE=750mb`. +
+
+Click to learn more about ArchiveBox's filesystem and hosting requirements... +
+ +
    +
  • ArchiveBox can use anywhere from ~1gb per 1000 Snapshots, to ~50gb per 1000 Snapshots, mostly dependent on whether you're saving video/audio using YTDLP_ENABLED=True and whether you lower YTDLP_MAX_SIZE=750m.
  • +
  • Disk usage can be reduced by using a compressed/deduplicated filesystem like ZFS/BTRFS, or by turning off extractor methods you don't need. You can also deduplicate content with a tool like fdupes or rdfind. +
  • +
  • Don't store large collections on older filesystems like EXT3/FAT as they may not be able to handle more than 50k directory entries in the data/archive/ folder. +
  • +
  • Try to keep the data/index.sqlite3 file on a local drive (not a network mount) or SSD for maximum performance, however the data/archive/ folder can be on a network mount or slower HDD.
  • +
  • If using Docker or NFS/SMB/FUSE for the data/archive/ folder, you may need to set PUID & PGID and disable root_squash on your fileshare server. +
  • +
-Storage requirements can be reduced by using a compressed/deduplicated filesystem like ZFS/BTRFS, or by turning off extractors methods you don't need. **Don't store large collections on older filesystems like EXT3/FAT** as they may not be able to handle more than 50k directory entries in the `archive/` folder. +

Learn More

-**Try to keep the `index.sqlite3` file on local drive (not a network mount)**, and ideally on an SSD for maximum performance, however the `archive/` folder can be on a network mount or spinning HDD. + + +

+ --- +
+ ## Screenshots
- + @@ -592,133 +1148,173 @@ Storage requirements can be reduced by using a compressed/deduplicated filesyste
---- -
- -
-paisley graphic +
+paisley graphic
+ # Background & Motivation -The aim of ArchiveBox is to enable more of the internet to be archived by empowering people to self-host their own archives. The intent is for all the web content you care about to be viewable with common software in 50 - 100 years without needing to run ArchiveBox or other specialized software to replay it. +ArchiveBox aims to enable more of the internet to be saved from deterioration by empowering people to self-host their own archives. The intent is for all the web content you care about to be viewable with common software in 50 - 100 years without needing to run ArchiveBox or other specialized software to replay it. + + +
+
+Click to read more about why archiving is important and how to do it ethically... +
+ Vast treasure troves of knowledge are lost every day on the internet to link rot. As a society, we have an imperative to preserve some important parts of that treasure, just like we preserve our books, paintings, and music in physical libraries long after the originals go out of print or fade into obscurity. -Whether it's to resist censorship by saving articles before they get taken down or edited, or just to save a collection of early 2010's flash games you love to play, having the tools to archive internet content enables to you save the stuff you care most about before it disappears. +Whether it's to resist censorship by saving news articles before they get taken down or edited, or just to save a collection of early 2010's flash games you loved to play, having the tools to archive internet content enables you to save the stuff you care most about before it disappears. -
-
- Image from WTF is Link Rot?...
+
+
+Image from Perma.cc...
-The balance between the permanence and ephemeral nature of content on the internet is part of what makes it beautiful. I don't think everything should be preserved in an automated fashion--making all content permanent and never removable, but I do think people should be able to decide for themselves and effectively archive specific content that they care about. +The balance between the permanence and ephemeral nature of content on the internet is part of what makes it beautiful. I don't think everything should be preserved in an automated fashion--making all content permanent and never removable, but I do think people should be able to decide for themselves and effectively archive specific content that they care about, just like libraries do. Without the work of archivists saving physical books, manuscripts, and paintings we wouldn't have any knowledge of our ancestors' history. I believe archiving the web is just as important to provide the same benefit to future generations. + +ArchiveBox's stance is that duplication of other people's content is only ethical if it: + +- A. doesn't deprive the original creators of revenue and +- B. is responsibly curated by an individual/institution. + +In the U.S., libraries, researchers, and archivists are allowed to duplicate copyrighted materials under "fair use" for private study, scholarship, or research. Archive.org's non-profit preservation work is covered under fair use in the US, and they properly handle unethical content/DMCA/GDPR removal requests to maintain good standing in the eyes of the law. + +As long as you A. don't try to profit off pirating copyrighted content and B. have processes in place to respond to removal requests, many countries allow you to use software like ArchiveBox to ethically and responsibly archive any web content you can view. That being said, ArchiveBox is not liable for how you choose to operate the software. 
You must research your own local laws and regulations, and get proper legal counsel if you plan to host a public instance (start by putting your DMCA/GDPR contact info in FOOTER_INFO and changing your instance's branding using CUSTOM_TEMPLATES_DIR). + +
+
-Because modern websites are complicated and often rely on dynamic content, -ArchiveBox archives the sites in **several different formats** beyond what public archiving services like Archive.org/Archive.is save. Using multiple methods and the market-dominant browser to execute JS ensures we can save even the most complex, finicky websites in at least a few high-quality, long-term data formats. ## Comparison to Other Projects -comparison +comparison + -â–ļ **Check out our [community page](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community) for an index of web archiving initiatives and projects.** +> **Check out our [community wiki](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community) for a list of alternative web archiving tools and orgs.** -A variety of open and closed-source archiving projects exist, but few provide a nice UI and CLI to manage a large, high-fidelity archive collection over time. +ArchiveBox gained momentum in the internet archiving industry because it uniquely combines 3 things: -ArchiveBox tries to be a robust, set-and-forget archiving solution suitable for archiving RSS feeds, bookmarks, or your entire browsing history (beware, it may be too big to store), ~~including private/authenticated content that you wouldn't otherwise share with a centralized service~~ (this is not recommended due to JS replay security concerns). +- **it's distributed:** users own their data instead of entrusting it to one big central provider +- **it's future-proof:** saving in *multiple formats* and extracting out raw TXT, PNG, PDF, MP4, etc. files +- **it's extensible:** with powerful APIs, flexible storage, and a big community adding new extractors regularly -### Comparison With Centralized Public Archives +
+
+Expand for a more direct comparison to Archive.org and specific open-source alternatives...
-Not all content is suitable to be archived in a centralized collection, wehther because it's private, copyrighted, too large, or too complex. ArchiveBox hopes to fill that gap. +ArchiveBox tries to be a robust, set-and-forget archiving solution suitable for archiving RSS feeds, bookmarks, or your entire browsing history (beware, it may be too big to store), including private/authenticated content that you wouldn't otherwise share with a centralized service like Archive.org. -By having each user store their own content locally, we can save much larger portions of everyone's browsing history than a shared centralized service would be able to handle. The eventual goal is to work towards federated archiving where users can share portions of their collections with each other. +

Comparison With Centralized Public Archives

-### Comparison With Other Self-Hosted Archiving Options +Not all content is suitable to be archived on a centralized, publicly accessible platform. Archive.org doesn't offer the ability to save things behind login walls for good reason, as the content may not have been intended for a public audience. ArchiveBox exists to fill that gap by letting everyone save what they have access to on an individual basis, and to encourage decentralized archiving that's less susceptible to censorship or natural disasters. -ArchiveBox differentiates itself from [similar self-hosted projects](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#Web-Archiving-Projects) by providing both a comprehensive CLI interface for managing your archive, a Web UI that can be used either indepenently or together with the CLI, and a simple on-disk data format that can be used without either. +By having users store their content locally or within their organizations, we can also save much larger portions of the internet than a centralized service has the disk capacity to handle. The eventual goal is to work towards federated archiving where users can share portions of their collections with each other, and with central archives on a case-by-case basis. -ArchiveBox is neither the highest fidelity, nor the simplest tool available for self-hosted archiving, rather it's a jack-of-all-trades that tries to do most things well by default. It can be as simple or advanced as you want, and is designed to do everything out-of-the-box but be tuned to suit your needs. +

Comparison With Other Self-Hosted Archiving Options

-*If being able to archive very complex interactive pages with JS and video is paramount, check out ArchiveWeb.page and ReplayWeb.page.* +ArchiveBox differentiates itself from [similar self-hosted projects](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#Web-Archiving-Projects) by providing both a comprehensive CLI interface for managing your archive, a Web UI that can be used either independently or together with the CLI, and a simple on-disk data format that can be used without either. -*If you prefer a simpler, leaner solution that archives page text in markdown and provides note-taking abilities, check out Archivy or 22120.* + +*If you want better fidelity for very complex interactive pages with heavy JS/streams/API requests, check out [ArchiveWeb.page](https://archiveweb.page) and [ReplayWeb.page](https://replayweb.page).* + +*If you want more bookmark categorization and note-taking features, check out [Memex](https://github.com/WorldBrain/Memex), [Hoarder](https://github.com/hoarder-app/hoarder), [LinkWarden](https://github.com/linkwarden/linkwarden), [Archivy](https://archivy.github.io/), or [LinkAce](https://www.linkace.org/).* + +*If you need more advanced recursive spider/crawling ability beyond `--depth=1`, check out [Browsertrix](https://github.com/webrecorder/browsertrix-crawler), [Photon](https://github.com/s0md3v/Photon), or [Scrapy](https://scrapy.org/) and pipe the outputted URLs into ArchiveBox.* For more alternatives, see our [list here](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#Web-Archiving-Projects)... -
+ArchiveBox is neither the highest fidelity nor the simplest tool available for self-hosted archiving, rather it's a jack-of-all-trades that tries to do most things well by default. We encourage you to try these other tools made by our friends if ArchiveBox isn't suited to your needs. + +
+
-dependencies graphic -
+ + ## Internet Archiving Ecosystem -Whether you want to learn which organizations are the big players in the web archiving space, want to find a specific open-source tool for your web archiving need, or just want to see where archivists hang out online, our Community Wiki page serves as an index of the broader web archiving community. Check it out to learn about some of the coolest web archiving projects and communities on the web! + - +
+Our Community Wiki strives to be a comprehensive index of the web archiving industry... +
- [Community Wiki](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community) - - [The Master Lists](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#the-master-lists) - _Community-maintained indexes of archiving tools and institutions._ - [Web Archiving Software](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#web-archiving-projects) - _Open source tools and projects in the internet archiving space._ + _List of ArchiveBox alternatives and open source projects in the internet archiving space._ + - [Awesome-Web-Archiving Lists](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#the-master-lists) + _Community-maintained indexes of archiving tools and institutions like `iipc/awesome-web-archiving`._ - [Reading List](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#reading-list) _Articles, posts, and blogs relevant to ArchiveBox and web archiving in general._ - [Communities](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#communities) _A collection of the most active internet archiving communities and initiatives._ - Check out the ArchiveBox [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap) and [Changelog](https://github.com/ArchiveBox/ArchiveBox/wiki/Changelog) -- Learn why archiving the internet is important by reading the "[On the Importance of Web Archiving](https://parameters.ssrc.org/2018/09/on-the-importance-of-web-archiving/)" blog post. +- Learn why archiving the internet is important by reading the "[On the Importance of Web Archiving](https://items.ssrc.org/parameters/on-the-importance-of-web-archiving/)" blog post. - Reach out to me for questions and comments via [@ArchiveBoxApp](https://twitter.com/ArchiveBoxApp) or [@theSquashSH](https://twitter.com/thesquashSH) on Twitter +
+
**Need help building a custom archiving solution?** -> ✨ **[Hire the team that helps build Archivebox](https://monadical.com) to work on your project.** (we're [@MonadicalSAS](https://twitter.com/MonadicalSAS) on Twitter) - -(They also do general software consulting across many industries) +> ✨ **[Hire the team that built Archivebox](https://zulip.archivebox.io/#narrow/stream/167-enterprise/topic/welcome/near/1191102) to solve archiving for your org.** ([@ArchiveBoxApp](https://twitter.com/ArchiveBoxApp))
---- -
-documentation graphic +
+documentation graphic
# Documentation - + -We use the [Github wiki system](https://github.com/ArchiveBox/ArchiveBox/wiki) and [Read the Docs](https://archivebox.readthedocs.io/en/latest/) (WIP) for documentation. +We use the [ArchiveBox GitHub Wiki](https://github.com/ArchiveBox/ArchiveBox/wiki) for documentation. -You can also access the docs locally by looking in the [`ArchiveBox/docs/`](https://github.com/ArchiveBox/ArchiveBox/wiki/Home) folder. +There is also a mirror available on Read the Docs (though it's sometimes outdated). + +> ✏️ You can submit docs changes & suggestions in our dedicated repo [`ArchiveBox/docs`](https://github.com/ArchiveBox/docs). ## Getting Started - [Quickstart](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart) - [Install](https://github.com/ArchiveBox/ArchiveBox/wiki/Install) - [Docker](https://github.com/ArchiveBox/ArchiveBox/wiki/Docker) - -## Reference - - [Usage](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage) - [Configuration](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration) - [Supported Sources](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) - [Supported Outputs](https://github.com/ArchiveBox/ArchiveBox/wiki#can-save-these-things-for-each-site) - [Scheduled Archiving](https://github.com/ArchiveBox/ArchiveBox/wiki/Scheduled-Archiving) -- [Publishing Your Archive](https://github.com/ArchiveBox/ArchiveBox/wiki/Publishing-Your-Archive) -- [Chromium Install](https://github.com/ArchiveBox/ArchiveBox/wiki/Chromium-Install) + +## Advanced + - [Security Overview](https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview) +- [Cookies & Sessions Setup](https://github.com/ArchiveBox/ArchiveBox/wiki/Chromium-Install#setting-up-a-chromium-user-profile) (archiving sites that require logins) +- [Setting up the Search Backends](https://github.com/ArchiveBox/ArchiveBox/wiki/Setting-up-Search) (choosing ripgrep, Sonic, or FTS5) +- [Setting up Local/Remote
Storages](https://github.com/ArchiveBox/ArchiveBox/wiki/Setting-up-Storage) (S3/B2/Google Drive/SMB/NFS/etc.) +- [Setting up Authentication & Permissions](https://github.com/ArchiveBox/ArchiveBox/wiki/Setting-up-Authentication) (SSO/LDAP/OAuth/API Keys/etc.) +- [Publishing Your Archive](https://github.com/ArchiveBox/ArchiveBox/wiki/Publishing-Your-Archive) (sharing your archive server with others) +- [Chromium Install Options](https://github.com/ArchiveBox/ArchiveBox/wiki/Chromium-Install) (installing and configuring ArchiveBox's Chrome) +- [Upgrading or Merging Archives](https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives) - [Troubleshooting](https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting) -- [Python API](https://docs.archivebox.io/en/latest/modules.html) (alpha) -- [REST API](https://github.com/ArchiveBox/ArchiveBox/issues/496) (alpha) + +## Developers + +- [Developer Documentation](https://github.com/ArchiveBox/ArchiveBox#archivebox-development) +- [Python API](https://docs.archivebox.io/) +- [REST API](https://demo.archivebox.io/api) (alpha) ## More Info -- [Tickets](https://github.com/ArchiveBox/ArchiveBox/issues) +- [Bug Tracker](https://github.com/ArchiveBox/ArchiveBox/issues) - [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap) -- [Changelog](https://github.com/ArchiveBox/ArchiveBox/wiki/Changelog) +- [Changelog](https://github.com/ArchiveBox/ArchiveBox/releases) - [Donations](https://github.com/ArchiveBox/ArchiveBox/wiki/Donations) - [Background & Motivation](https://github.com/ArchiveBox/ArchiveBox#background--motivation) - [Web Archiving Community](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community) @@ -727,47 +1323,61 @@ You can also access the docs locally by looking in the [`ArchiveBox/docs/`](http --- -
-development +
+development
# ArchiveBox Development All contributions to ArchiveBox are welcomed! Check our [issues](https://github.com/ArchiveBox/ArchiveBox/issues) and [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap) for things to work on, and please open an issue to discuss your proposed implementation before working on things! Otherwise we may have to close your PR if it doesn't align with our roadmap. -Low hanging fruit / easy first tickets:
-Total alerts +For low hanging fruit / easy first tickets, see: ArchiveBox/Issues `#good first ticket` `#help wanted`. + +**Python API Documentation:** https://docs.archivebox.io/en/dev/archivebox.html#module-archivebox.main + +**Internal Architecture Diagrams:** https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams + ### Setup the dev environment
Click to expand... -#### 1. Clone the main code repo (making sure to pull the submodules as well) +#### 1. Setup the monorepo + +First make sure you have `uv` installed: https://docs.astral.sh/uv/getting-started/installation/ ```bash -git clone --recurse-submodules https://github.com/ArchiveBox/ArchiveBox -cd ArchiveBox -git checkout dev # or the branch you want to test -git submodule update --init --recursive -git pull --recurse-submodules +git clone https://github.com/ArchiveBox/monorepo +cd monorepo +./bin/setup.sh + +# activate the monorepo venv, then cd into archivebox +source .venv/bin/activate +cd archivebox ``` +Repos included in monorepo setup: + +- https://github.com/ArchiveBox/abxbus +- https://github.com/ArchiveBox/abx-pkg +- https://github.com/ArchiveBox/abx-plugins +- https://github.com/ArchiveBox/abx-dl +- https://github.com/ArchiveBox/ArchiveBox + + #### 2. Option A: Install the Python, JS, and system dependencies directly on your machine ```bash -# Install ArchiveBox + python dependencies -python3 -m venv .venv && source .venv/bin/activate && pip install -e '.[dev]' -# or: pipenv install --dev && pipenv shell +# Install ArchiveBox runtime dependencies +mkdir -p data && cd data +archivebox init +archivebox install # detect and install all extractor dependencies -# Install node dependencies -npm install -# or -archivebox setup +# Run the development server w/ autoreloading (but no bg workers) +archivebox server --debug --reload 0.0.0.0:8000 -# Check to see if anything is missing -archivebox --version -# install any missing dependencies manually, or use the helper script: -./bin/setup.sh +# Run the production server (with bg workers but no autoreloading) +archivebox server 0.0.0.0:8000 ``` #### 2. Option B: Build the docker container and use that for development instead @@ -776,14 +1386,18 @@ archivebox --version # Optional: develop via docker by mounting the code dir into the container # if you edit e.g. 
./archivebox/core/models.py on the docker host, runserver # inside the container will reload and pick up your changes -docker build . -t archivebox -docker run -it archivebox init --setup -docker run -it -p 8000:8000 \ - -v $PWD/data:/data \ - -v $PWD/archivebox:/app/archivebox \ - archivebox server 0.0.0.0:8000 --debug --reload +./bin/build_docker.sh dev + +docker run -it -v $PWD/data:/data archivebox/archivebox:dev init --install + +# Run the development server w/ autoreloading (but no bg workers) +docker run -it -v $PWD/data:/data -v $PWD/archivebox:/app/archivebox -p 8000:8000 archivebox/archivebox:dev server --debug --reload 0.0.0.0:8000 + +# Run the production server (with bg workers but no autoreloading) +docker run -it -v $PWD/data:/data -v $PWD/archivebox:/app/archivebox -p 8000:8000 archivebox/archivebox:dev server # (remove the --reload flag and add the --nothreading flag when profiling with the django debug toolbar) +# When using --reload, make sure any files you create can be read by the user in the Docker container, eg with 'chmod a+rX'. ```
@@ -791,70 +1405,142 @@ docker run -it -p 8000:8000 \ ### Common development tasks See the `./bin/` folder and read the source of the bash scripts within. -You can also run all these in Docker. For more examples see the Github Actions CI/CD tests that are run: `.github/workflows/*.yaml`. +You can also run all these in Docker. For more examples see the GitHub Actions CI/CD tests that are run: `.github/workflows/*.yaml`. #### Run in DEBUG mode
Click to expand... ```bash +# set up persistent DEBUG=True for all runs archivebox config --set DEBUG=True + +# OR you can run a dev server with DEBUG=True in a few ways: +archivebox server --debug --reload 0.0.0.0:8000 +# or +archivebox server --debug 0.0.0.0:8000 # or -archivebox server --debug ... +env DEBUG=True daphne -b 0.0.0.0 -p 8000 archivebox.core.asgi:application ``` +https://stackoverflow.com/questions/1074212/how-can-i-see-the-raw-sql-queries-django-is-running +
-#### Build and run a Github branch +#### Install and run a specific GitHub branch
Click to expand... +##### Use a Pre-Built Image + +If you're looking for the latest `dev` Docker image, it's often available pre-built on Docker Hub, simply pull and use `archivebox/archivebox:dev`. + ```bash -docker build -t archivebox:dev https://github.com/ArchiveBox/ArchiveBox.git#dev -docker run -it -v $PWD:/data archivebox:dev ... +docker pull archivebox/archivebox:dev +docker run archivebox/archivebox:dev version +# verify the BUILD_TIME and COMMIT_HASH in the output are recent ``` -
+##### Build Branch from Source + +You can also build and run any branch yourself from source, for example to build & use `dev` locally: -#### Run the linters +```bash +# docker-compose.yml: +services: + archivebox: + image: archivebox/archivebox:dev + build: 'https://github.com/ArchiveBox/ArchiveBox.git#dev' + ... -
Click to expand... +# or with plain Docker: +docker build -t archivebox:dev https://github.com/ArchiveBox/ArchiveBox.git#dev +docker run -it -v $PWD:/data archivebox:dev init -```bash -./bin/lint.sh +# or with pip: +pip install 'git+https://github.com/pirate/ArchiveBox@dev' +npm install 'git+https://github.com/ArchiveBox/ArchiveBox.git#dev' +archivebox install ``` -(uses `flake8` and `mypy`)
-#### Run the integration tests +#### Run the linters / tests
Click to expand... ```bash +./bin/lint.sh ./bin/test.sh ``` -(uses `pytest -s`) +(uses `ruff`, `pyright`, `ty`, and `pytest -s`)
-#### Make migrations or enter a django shell + +#### Make DB migrations, enter Django shell, other dev helper commands
Click to expand... -Make sure to run this whenever you change things in `models.py`. ```bash +# generate the database migrations after changes to models.py cd archivebox/ ./manage.py makemigrations +# enter a python shell or a SQL shell cd path/to/test/data/ archivebox shell archivebox manage dbshell + +# generate a graph of the ORM models +brew install graphviz +pip install pydot graphviz +archivebox manage graph_models -a -o orm.png +open orm.png + +# list all models with field db info and methods +archivebox manage list_model_info --all --signature --db-type --field-class + +# print all django settings +archivebox manage print_settings +archivebox manage print_settings --format=yaml # pip install pyyaml + +# autogenerate an admin.py from given app models +archivebox manage admin_generator core > core/admin.py + +# dump db data to a script that re-populates it +archivebox manage dumpscript core > scripts/testdata.py +archivebox manage reset core +archivebox manage runscript testdata + +# resetdb and clear all data! +archivebox manage reset_db + +# use django-tui to interactively explore commands +uv pip install django-tui +# ensure django-tui is in INSTALLED_APPS: core/settings.py +archivebox manage tui ``` -(uses `pytest -s`) + +ArchiveBox ORM models relationship graph + +- https://django-extensions.readthedocs.io/en/latest/command_extensions.html +- https://stackoverflow.com/questions/1074212/how-can-i-see-the-raw-sql-queries-django-is-running +- https://github.com/anze3db/django-tui (explore `manage.py` commands as TUI) +- https://github.com/bloomberg/memray (advanced python profiler) +- https://github.com/laixintao/flameshow (display flamegraphs in terminal) +- https://github.com/taliraj/django-migrations-tui (explore migrations as TUI)
+#### Contributing a new extractor + + +Extractors are maintained in a separate repo here: https://github.com/ArchiveBox/abx-plugins (included in monorepo setup). + +Copy a similar plugin as a template to modify, then open a new PR to add it in that repo. + + #### Build the docs, pip package, and docker image
Click to expand... @@ -866,8 +1552,6 @@ archivebox manage dbshell # or individually: ./bin/build_docs.sh ./bin/build_pip.sh -./bin/build_deb.sh -./bin/build_brew.sh ./bin/build_docker.sh ``` @@ -884,8 +1568,6 @@ archivebox manage dbshell # or individually: ./bin/release_docs.sh ./bin/release_pip.sh -./bin/release_deb.sh -./bin/release_brew.sh ./bin/release_docker.sh ``` @@ -893,45 +1575,33 @@ archivebox manage dbshell --- -## Futher Reading +## Further Reading -- Home: https://archivebox.io -- Demo: https://demo.archivebox.io -- Docs: https://docs.archivebox.io -- Wiki: https://wiki.archivebox.io -- Issues: https://issues.archivebox.io -- Forum: https://forum.archivebox.io -- Releases: https://releases.archivebox.io -- Donations: https://github.com/sponsors/pirate + + +- [ArchiveBox.io Website](https://archivebox.io) / [ArchiveBox Github (Source Code)](https://github.com/ArchiveBox/ArchiveBox) / [ArchiveBox Demo Server](https://demo.archivebox.io) +- [Documentation (Github Wiki)](https://github.com/ArchiveBox/ArchiveBox/wiki) / [API Reference Docs (ReadTheDocs)](https://docs.archivebox.io) / [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap) / [Changelog](https://github.com/ArchiveBox/ArchiveBox/releases) +- [Bug Tracker (Github Issues)](https://github.com/ArchiveBox/ArchiveBox/issues) / [Discussions (Github Discussions)](https://github.com/ArchiveBox/ArchiveBox/discussions) / [Community Chat Forum (Zulip)](https://zulip.archivebox.io) +- Find us on social media: [Twitter `@ArchiveBoxApp`](https://twitter.com/ArchiveBoxApp), [LinkedIn](https://www.linkedin.com/company/archivebox/), [YouTube](https://www.youtube.com/@ArchiveBoxApp), [SaaSHub](https://www.saashub.com/archivebox), [Alternative.to](https://alternativeto.net/software/archivebox/about/), [Reddit](https://www.reddit.com/r/ArchiveBox/) --- -
-

-
- -This project is maintained mostly in my spare time with the help from generous contributors and Monadical (✨ hire them for dev work!). - - +
+đŸ›ī¸ Contact us for professional support đŸ’Ŧ


- -
-Sponsor this project on Github -
-
- -
- - - - +   +   +   +   +   +
- - - -
- -✨ Have spare CPU/disk/bandwidth and want to help the world? Check out our Good Karma Kit... - +ArchiveBox operates as a US 501(c)(3) nonprofit FSP (sponsored by HCB), direct donations are tax-deductible. +

+  +  +

+
+✨ Have spare CPU/disk/bandwidth after all your 网站存档爬 and want to help the world?
Check out our Good Karma Kit...
diff --git a/_config.yml b/_config.yml deleted file mode 100644 index c50ff38dab..0000000000 --- a/_config.yml +++ /dev/null @@ -1 +0,0 @@ -theme: jekyll-theme-merlot \ No newline at end of file diff --git a/archivebox/.flake8 b/archivebox/.flake8 index dd6ba8e47a..bb7176bd1f 100644 --- a/archivebox/.flake8 +++ b/archivebox/.flake8 @@ -3,4 +3,4 @@ ignore = D100,D101,D102,D103,D104,D105,D202,D203,D205,D400,E131,E241,E252,E266,E select = F,E9,W max-line-length = 130 max-complexity = 10 -exclude = migrations,tests,node_modules,vendor,static,venv,.venv,.venv2,.docker-venv +exclude = migrations,tests,node_modules,vendor,venv,.venv,.venv2,.docker-venv,data,data* diff --git a/archivebox/LICENSE b/archivebox/LICENSE deleted file mode 120000 index ea5b60640b..0000000000 --- a/archivebox/LICENSE +++ /dev/null @@ -1 +0,0 @@ -../LICENSE \ No newline at end of file diff --git a/archivebox/__init__.py b/archivebox/__init__.py old mode 100644 new mode 100755 index b0c00b6118..bd464ac99c --- a/archivebox/__init__.py +++ b/archivebox/__init__.py @@ -1 +1,137 @@ -__package__ = 'archivebox' +#!/usr/bin/env python3 + +# Welcome to the ArchiveBox source code! Thanks for checking it out! +# +# "We are swimming upstream against a great torrent of disorganization. +# In this, our main obligation is to establish arbitrary enclaves of order and system. +# It is the greatest possible victory to be, to continue to be, and to have been. +# No defeat can deprive us of the success of having existed for some moment of time +# in a universe that seems indifferent to us." +# --Norber Weiner + +__package__ = "archivebox" + +import os +import sys +from pathlib import Path +from typing import Protocol, cast + +from abx_plugins import get_plugins_dir + + +class _ReconfigurableStream(Protocol): + def reconfigure(self, *, line_buffering: bool) -> object: ... 
+ + +# Force unbuffered output for real-time logs +if hasattr(sys.stdout, "reconfigure"): + cast(_ReconfigurableStream, sys.stdout).reconfigure(line_buffering=True) + cast(_ReconfigurableStream, sys.stderr).reconfigure(line_buffering=True) +os.environ["PYTHONUNBUFFERED"] = "1" + +ASCII_LOGO = """ + █████╗ ██████╗ ██████╗██╗ ██╗██╗██╗ ██╗███████╗ ██████╗ ██████╗ ██╗ ██╗ +██╔══██╗██╔══██╗██╔════╝██║ ██║██║██║ ██║██╔════╝ ██╔══██╗██╔═══██╗╚██╗██╔╝ +███████║██████╔╝██║ ███████║██║██║ ██║█████╗ ██████╔╝██║ ██║ ╚███╔╝ +██╔══██║██╔══██╗██║ ██╔══██║██║╚██╗ ██╔╝██╔══╝ ██╔══██╗██║ ██║ ██╔██╗ +██║ ██║██║ ██║╚██████╗██║ ██║██║ ╚████╔╝ ███████╗ ██████╔╝╚██████╔╝██╔╝ ██╗ +╚═╝ ╚═╝╚═╝ ╚═╝ ╚═════╝╚═╝ ╚═╝╚═╝ ╚═══╝ ╚══════╝ ╚═════╝ ╚═════╝ ╚═╝ ╚═╝ +""" + +PACKAGE_DIR = Path(__file__).resolve().parent + +# # Add PACKAGE_DIR to sys.path - required for Django migrations to import models +# # Migrations reference models like 'machine.Binary' which need to be importable +# if str(PACKAGE_DIR) not in sys.path: +# sys.path.append(str(PACKAGE_DIR)) + +os.environ["DJANGO_SETTINGS_MODULE"] = "archivebox.core.settings" +os.environ["TZ"] = "UTC" + +# detect ArchiveBox user's UID/GID based on data dir ownership +from .config.permissions import drop_privileges # noqa + +drop_privileges() + +from .misc.checks import check_not_root, check_not_inside_source_dir, check_io_encoding # noqa + +check_not_root() +check_not_inside_source_dir() +check_io_encoding() + +# Install monkey patches for third-party libraries +from .misc.monkey_patches import * # noqa + +# Plugin directories +BUILTIN_PLUGINS_DIR = Path(get_plugins_dir()).resolve() +USER_PLUGINS_DIR = ( + Path( + os.environ.get("ARCHIVEBOX_USER_PLUGINS_DIR") or os.environ.get("USER_PLUGINS_DIR") or os.environ.get("DATA_DIR", os.getcwd()), + ) + / "custom_plugins" +) + +# These are kept for backwards compatibility with existing code +# that checks for plugins. 
The new hook system uses discover_hooks() +ALL_PLUGINS = { + "builtin": BUILTIN_PLUGINS_DIR, + "user": USER_PLUGINS_DIR, +} +LOADED_PLUGINS = ALL_PLUGINS + +# Setup basic config, constants, paths, and version +from .config.constants import CONSTANTS # noqa +from .config.paths import PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR # noqa +from .config.version import VERSION # noqa + +# Set MACHINE_ID env var so hook scripts can use it +os.environ.setdefault("MACHINE_ID", CONSTANTS.MACHINE_ID) + +__version__ = VERSION +__author__ = "ArchiveBox" +__license__ = "MIT" + +ASCII_ICON = """ +██████████████████████████████████████████████████████████████████████████████████████████████████ +██████████████████████████████████████████████████████████████████████████████████████████████████ +██████████████████████████████████████████████████████████████████████████████████████████████████ +██████████████████████████████████████████████████████████████████████████████████████████████████ +██████████████████████████████████████████████████████████████████████████████████████████████████ +██████████████████████████████████████████████████████████████████████████████████████████████████ +██████████████████████████████████████████████████████████████████████████████████████████████████ + ██ ██ + ██ ██ + ██ ██ + ██ ██ + ██ ██ + ██ ██ + ██ ██ + ██ ██ + ██ ██ + ██ ██ + ██ ██ + ██ ██ + ██ ████████████████████████████████████ ██ + ██ ██ █████████████████████████ █ ██ + ██ ██ █████████████████████████ █ ██ + ██ ██ █████████████████████████ █ ██ + ██ ██ █████████████████████████ █ ██ + ██ ██ █████████████████████████ █ ██ + ██ ██ █████████████████████████ █ ██ + ██ ██ █████████████████████████ █ ██ + ██ ██ █████████████████████████ █ ██ + ██ ██ █████████████████████████ █ ██ + ██ ████████████████████████████████████ ██ + ██ ██ + ██ ██ + ██ ██ + ██ ██ + ██ ██████████████████████████████████████████ ██ + ██ ██████████████████████████████████████████ ██ + ██ ██ + ██ ██ + ██ ██ + ██ ██ + ██ ██ + 
████████████████████████████████████████████████████████████████████████████████ +""" diff --git a/archivebox/__main__.py b/archivebox/__main__.py index 8afaa27a06..125ae2051b 100755 --- a/archivebox/__main__.py +++ b/archivebox/__main__.py @@ -1,11 +1,20 @@ #!/usr/bin/env python3 +"""This is the entrypoint for python -m archivebox ...""" -__package__ = 'archivebox' +__package__ = "archivebox" +import archivebox # noqa # make sure monkey patches are applied before anything else import sys from .cli import main +ASCII_LOGO_MINI = r""" + _ _ _ ____ + / \ _ __ ___| |__ (_)_ _____| __ ) _____ __ + / _ \ | '__/ __| '_ \| \ \ / / _ \ _ \ / _ \ \/ / + / ___ \| | | (__| | | | |\ V / __/ |_) | (_) > < + /_/ \_\_| \___|_| |_|_| \_/ \___|____/ \___/_/\_\ +""" -if __name__ == '__main__': +if __name__ == "__main__": main(args=sys.argv[1:], stdin=sys.stdin) diff --git a/archivebox/api/__init__.py b/archivebox/api/__init__.py new file mode 100644 index 0000000000..24b3281374 --- /dev/null +++ b/archivebox/api/__init__.py @@ -0,0 +1 @@ +__package__ = "archivebox.api" diff --git a/archivebox/api/admin.py b/archivebox/api/admin.py new file mode 100644 index 0000000000..1a71f88cbf --- /dev/null +++ b/archivebox/api/admin.py @@ -0,0 +1,98 @@ +__package__ = "archivebox.api" + +from django.contrib import admin +from django.http import HttpRequest +from signal_webhooks.admin import WebhookAdmin +from signal_webhooks.utils import get_webhook_model + +from archivebox.base_models.admin import BaseModelAdmin + +from archivebox.api.models import APIToken + + +class APITokenAdmin(BaseModelAdmin): + list_display = ("created_at", "id", "created_by", "token_redacted", "expires") + sort_fields = ("id", "created_at", "created_by", "expires") + readonly_fields = ("created_at", "modified_at") + search_fields = ("id", "created_by__username", "token") + + fieldsets = ( + ( + "Token", + { + "fields": ("token", "expires"), + "classes": ("card",), + }, + ), + ( + "Owner", + { + "fields": ("created_by",), 
+ "classes": ("card",), + }, + ), + ( + "Timestamps", + { + "fields": ("created_at", "modified_at"), + "classes": ("card",), + }, + ), + ) + + list_filter = ("created_by",) + ordering = ["-created_at"] + list_per_page = 100 + + +class CustomWebhookAdmin(WebhookAdmin, BaseModelAdmin): + list_display = ("created_at", "created_by", "id", *WebhookAdmin.list_display) + sort_fields = ("created_at", "created_by", "id", "referenced_model", "endpoint", "last_success", "last_error") + readonly_fields = ("created_at", "modified_at", *WebhookAdmin.readonly_fields) + + fieldsets = ( + ( + "Webhook", + { + "fields": ("name", "signal", "referenced_model", "endpoint"), + "classes": ("card", "wide"), + }, + ), + ( + "Authentication", + { + "fields": ("auth_token",), + "classes": ("card",), + }, + ), + ( + "Status", + { + "fields": ("enabled", "last_success", "last_error"), + "classes": ("card",), + }, + ), + ( + "Owner", + { + "fields": ("created_by",), + "classes": ("card",), + }, + ), + ( + "Timestamps", + { + "fields": ("created_at", "modified_at"), + "classes": ("card",), + }, + ), + ) + + def lookup_allowed(self, lookup: str, value: str, request: HttpRequest | None = None) -> bool: + """Preserve WebhookAdmin's auth token filter with Django's current admin signature.""" + return not lookup.startswith("auth_token") and admin.ModelAdmin.lookup_allowed(self, lookup, value, request) + + +def register_admin(admin_site: admin.AdminSite) -> None: + admin_site.register(APIToken, APITokenAdmin) + admin_site.register(get_webhook_model(), CustomWebhookAdmin) diff --git a/archivebox/api/apps.py b/archivebox/api/apps.py new file mode 100644 index 0000000000..94e2f6e558 --- /dev/null +++ b/archivebox/api/apps.py @@ -0,0 +1,14 @@ +__package__ = "archivebox.api" + +from django.apps import AppConfig + + +class APIConfig(AppConfig): + name = "archivebox.api" + label = "api" + + +def register_admin(admin_site): + from archivebox.api.admin import register_admin + + register_admin(admin_site) diff 
--git a/archivebox/api/auth.py b/archivebox/api/auth.py new file mode 100644 index 0000000000..5ef84d8b90 --- /dev/null +++ b/archivebox/api/auth.py @@ -0,0 +1,143 @@ +__package__ = "archivebox.api" + +from datetime import timedelta + +from django.utils import timezone +from django.http import HttpRequest +from django.contrib.auth import authenticate +from django.contrib.auth.models import User + +from ninja.security import HttpBearer, APIKeyQuery, APIKeyHeader, HttpBasicAuth +from ninja.errors import HttpError + + +def get_or_create_api_token(user: User | None): + from archivebox.api.models import APIToken + + if user and user.is_superuser: + api_tokens = APIToken.objects.filter(created_by_id=user.pk, expires__gt=timezone.now()) + if api_tokens.exists(): + # unexpired token exists, use it + api_token = api_tokens.last() + else: + # does not exist, create a new one + api_token = APIToken.objects.create(created_by_id=user.pk, expires=timezone.now() + timedelta(days=30)) + + if api_token is None: + return None + assert api_token.is_valid(), f"API token is not valid {api_token}" + + return api_token + return None + + +def auth_using_token(token: str | None, request: HttpRequest | None = None) -> User | None: + """Given an API token string, check if a corresponding non-expired APIToken exists, and return its user""" + from archivebox.api.models import APIToken # lazy import model to avoid loading it at urls.py import time + + user: User | None = None + + submitted_empty_form = str(token).strip() in ("string", "", "None", "null") + if not submitted_empty_form: + try: + api_token = APIToken.objects.get(token=token) + if api_token.is_valid() and isinstance(api_token.created_by, User): + user = api_token.created_by + if request is not None: + setattr(request, "_api_token", api_token) + except APIToken.DoesNotExist: + pass + + return user + + +def auth_using_password(username: str | None, password: str | None, request: HttpRequest | None = None) -> User | None: + """Given a 
username and password, check if they are valid and return the corresponding user""" + user: User | None = None + + submitted_empty_form = (username, password) in (("string", "string"), ("", ""), (None, None)) + if not submitted_empty_form: + authenticated_user = authenticate( + username=username, + password=password, + ) + if isinstance(authenticated_user, User): + user = authenticated_user + return user + + +### Base Auth Types + + +def _require_superuser(user: User | None, request: HttpRequest, auth_method: str) -> User | None: + if user and user.pk: + request.user = user + setattr(request, "_api_auth_method", auth_method) + if not user.is_superuser: + raise HttpError(403, "Valid credentials but User does not have permission (make sure user.is_superuser=True)") + return user + + +### Django-Ninja-Provided Auth Methods + + +class HeaderTokenAuth(APIKeyHeader): + """Allow authenticating by passing X-API-Key=xyz as a request header""" + + param_name = "X-ArchiveBox-API-Key" + + def authenticate(self, request: HttpRequest, key: str | None) -> User | None: + return _require_superuser(auth_using_token(token=key, request=request), request, self.__class__.__name__) + + +class BearerTokenAuth(HttpBearer): + """Allow authenticating by passing Bearer=xyz as a request header""" + + def authenticate(self, request: HttpRequest, token: str) -> User | None: + return _require_superuser(auth_using_token(token=token, request=request), request, self.__class__.__name__) + + +class QueryParamTokenAuth(APIKeyQuery): + """Allow authenticating by passing api_key=xyz as a GET/POST query parameter""" + + param_name = "api_key" + + def authenticate(self, request: HttpRequest, key: str | None) -> User | None: + return _require_superuser(auth_using_token(token=key, request=request), request, self.__class__.__name__) + + +class UsernameAndPasswordAuth(HttpBasicAuth): + """Allow authenticating by passing username & password via HTTP Basic Authentication (not recommended)""" + + def 
authenticate(self, request: HttpRequest, username: str, password: str) -> User | None: + return _require_superuser( + auth_using_password(username=username, password=password, request=request), + request, + self.__class__.__name__, + ) + + +class DjangoSessionAuth: + """Allow authenticating with existing Django session cookies (same-origin only).""" + + def __call__(self, request: HttpRequest) -> User | None: + return self.authenticate(request) + + def authenticate(self, request: HttpRequest, **kwargs) -> User | None: + user = getattr(request, "user", None) + if isinstance(user, User) and user.is_authenticated: + setattr(request, "_api_auth_method", self.__class__.__name__) + if not user.is_superuser: + raise HttpError(403, "Valid session but User does not have permission (make sure user.is_superuser=True)") + return user + return None + + +### Enabled Auth Methods + +API_AUTH_METHODS = [ + HeaderTokenAuth(), + BearerTokenAuth(), + QueryParamTokenAuth(), + # django_auth_superuser, # django admin cookie auth, not secure to use with csrf=False +] diff --git a/archivebox/api/middleware.py b/archivebox/api/middleware.py new file mode 100644 index 0000000000..8932762dae --- /dev/null +++ b/archivebox/api/middleware.py @@ -0,0 +1,32 @@ +__package__ = "archivebox.api" + +from django.http import HttpResponse + + +class ApiCorsMiddleware: + """Attach permissive CORS headers for API routes (token-based auth).""" + + def __init__(self, get_response): + self.get_response = get_response + + def __call__(self, request): + if request.path.startswith("/api/"): + if request.method == "OPTIONS" and request.META.get("HTTP_ACCESS_CONTROL_REQUEST_METHOD"): + response = HttpResponse(status=204) + return self._add_cors_headers(request, response) + + response = self.get_response(request) + return self._add_cors_headers(request, response) + + return self.get_response(request) + + def _add_cors_headers(self, request, response): + origin = request.META.get("HTTP_ORIGIN") + if not origin: + 
return response + + response["Access-Control-Allow-Origin"] = "*" + response["Access-Control-Allow-Methods"] = "GET, POST, PUT, PATCH, DELETE, OPTIONS" + response["Access-Control-Allow-Headers"] = "Authorization, X-ArchiveBox-API-Key, Content-Type, X-CSRFToken" + response["Access-Control-Max-Age"] = "600" + return response diff --git a/archivebox/api/migrations/0001_initial.py b/archivebox/api/migrations/0001_initial.py new file mode 100644 index 0000000000..1f3e6f3dda --- /dev/null +++ b/archivebox/api/migrations/0001_initial.py @@ -0,0 +1,239 @@ +# Generated by hand on 2025-12-29 +# Creates APIToken and OutboundWebhook tables using raw SQL + +from django.db import migrations, models +import django.db.models.deletion +import django.utils.timezone +from django.conf import settings +from archivebox.uuid_compat import uuid7 +from archivebox.base_models.models import get_or_create_system_user_pk +import archivebox.api.models +import signal_webhooks.fields +import signal_webhooks.utils + + +class Migration(migrations.Migration): + initial = True + + dependencies = [ + ("auth", "0012_alter_user_first_name_max_length"), + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ] + + operations = [ + migrations.SeparateDatabaseAndState( + database_operations=[ + migrations.RunSQL( + sql=""" + -- Create api_apitoken table + CREATE TABLE IF NOT EXISTS api_apitoken ( + id TEXT PRIMARY KEY NOT NULL, + created_at DATETIME NOT NULL, + modified_at DATETIME NOT NULL, + + token VARCHAR(32) NOT NULL UNIQUE, + expires DATETIME, + + created_by_id INTEGER NOT NULL, + + FOREIGN KEY (created_by_id) REFERENCES auth_user(id) ON DELETE CASCADE + ); + CREATE INDEX IF NOT EXISTS api_apitoken_created_by_id_idx ON api_apitoken(created_by_id); + CREATE INDEX IF NOT EXISTS api_apitoken_created_at_idx ON api_apitoken(created_at); + CREATE INDEX IF NOT EXISTS api_apitoken_token_idx ON api_apitoken(token); + + -- Create api_outboundwebhook table + CREATE TABLE IF NOT EXISTS api_outboundwebhook 
( + id TEXT PRIMARY KEY NOT NULL, + created_at DATETIME NOT NULL, + modified_at DATETIME NOT NULL, + + name VARCHAR(255) NOT NULL UNIQUE, + signal VARCHAR(255) NOT NULL, + ref VARCHAR(1024) NOT NULL, + endpoint VARCHAR(2048) NOT NULL, + headers TEXT NOT NULL DEFAULT '{}', + auth_token TEXT NOT NULL DEFAULT '', + enabled BOOLEAN NOT NULL DEFAULT 1, + keep_last_response BOOLEAN NOT NULL DEFAULT 0, + created DATETIME NOT NULL, + updated DATETIME NOT NULL, + last_response TEXT NOT NULL DEFAULT '', + last_success DATETIME, + last_failure DATETIME, + + created_by_id INTEGER NOT NULL, + + FOREIGN KEY (created_by_id) REFERENCES auth_user(id) ON DELETE CASCADE + ); + CREATE INDEX IF NOT EXISTS api_outboundwebhook_created_by_id_idx ON api_outboundwebhook(created_by_id); + CREATE INDEX IF NOT EXISTS api_outboundwebhook_created_at_idx ON api_outboundwebhook(created_at); + CREATE INDEX IF NOT EXISTS api_outboundwebhook_name_idx ON api_outboundwebhook(name); + CREATE INDEX IF NOT EXISTS api_outboundwebhook_ref_idx ON api_outboundwebhook(ref); + """, + reverse_sql=""" + DROP TABLE IF EXISTS api_outboundwebhook; + DROP TABLE IF EXISTS api_apitoken; + """, + ), + ], + state_operations=[ + migrations.CreateModel( + name="APIToken", + fields=[ + ("id", models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)), + ("created_at", models.DateTimeField(db_index=True, default=django.utils.timezone.now)), + ("modified_at", models.DateTimeField(auto_now=True)), + ("token", models.CharField(default=archivebox.api.models.generate_secret_token, max_length=32, unique=True)), + ("expires", models.DateTimeField(blank=True, null=True)), + ( + "created_by", + models.ForeignKey( + default=get_or_create_system_user_pk, + on_delete=django.db.models.deletion.CASCADE, + to=settings.AUTH_USER_MODEL, + ), + ), + ], + options={ + "verbose_name": "API Key", + "verbose_name_plural": "API Keys", + "app_label": "api", + }, + ), + migrations.CreateModel( + 
name="OutboundWebhook", + fields=[ + ("id", models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)), + ("created_at", models.DateTimeField(db_index=True, default=django.utils.timezone.now)), + ("modified_at", models.DateTimeField(auto_now=True)), + ( + "name", + models.CharField(db_index=True, help_text="Webhook name.", max_length=255, unique=True, verbose_name="name"), + ), + ( + "signal", + models.CharField( + choices=[ + ("CREATE", "Create"), + ("UPDATE", "Update"), + ("DELETE", "Delete"), + ("M2M", "M2M changed"), + ("CREATE_OR_UPDATE", "Create or Update"), + ("CREATE_OR_DELETE", "Create or Delete"), + ("CREATE_OR_M2M", "Create or M2M changed"), + ("UPDATE_OR_DELETE", "Update or Delete"), + ("UPDATE_OR_M2M", "Update or M2M changed"), + ("DELETE_OR_M2M", "Delete or M2M changed"), + ("CREATE_UPDATE_OR_DELETE", "Create, Update or Delete"), + ("CREATE_UPDATE_OR_M2M", "Create, Update or M2M changed"), + ("CREATE_DELETE_OR_M2M", "Create, Delete or M2M changed"), + ("UPDATE_DELETE_OR_M2M", "Update, Delete or M2M changed"), + ("CREATE_UPDATE_DELETE_OR_M2M", "Create, Update or Delete, or M2M changed"), + ], + help_text="Signal the webhook fires to.", + max_length=255, + verbose_name="signal", + ), + ), + ( + "ref", + models.CharField( + db_index=True, + help_text="Dot import notation to the model the webhook is for.", + max_length=1023, + validators=[signal_webhooks.utils.model_from_reference], + verbose_name="referenced model", + ), + ), + ( + "endpoint", + models.URLField(help_text="Target endpoint for this webhook.", max_length=2047, verbose_name="endpoint"), + ), + ( + "headers", + models.JSONField( + blank=True, + default=dict, + help_text="Headers to send with the webhook request.", + validators=[signal_webhooks.utils.is_dict], + verbose_name="headers", + ), + ), + ( + "auth_token", + signal_webhooks.fields.TokenField( + blank=True, + default="", + help_text="Authentication token to use in an Authorization header.", + 
max_length=8000, + validators=[signal_webhooks.utils.decode_cipher_key], + verbose_name="authentication token", + ), + ), + ("enabled", models.BooleanField(default=True, help_text="Is this webhook enabled?", verbose_name="enabled")), + ( + "keep_last_response", + models.BooleanField( + default=False, + help_text="Should the webhook keep a log of the latest response it got?", + verbose_name="keep last response", + ), + ), + ( + "created", + models.DateTimeField(auto_now_add=True, help_text="When the webhook was created.", verbose_name="created"), + ), + ( + "updated", + models.DateTimeField(auto_now=True, help_text="When the webhook was last updated.", verbose_name="updated"), + ), + ( + "last_response", + models.CharField( + blank=True, + default="", + help_text="Latest response to this webhook.", + max_length=8000, + verbose_name="last response", + ), + ), + ( + "last_success", + models.DateTimeField( + default=None, + help_text="When the webhook last succeeded.", + null=True, + verbose_name="last success", + ), + ), + ( + "last_failure", + models.DateTimeField( + default=None, + help_text="When the webhook last failed.", + null=True, + verbose_name="last failure", + ), + ), + ( + "created_by", + models.ForeignKey( + default=get_or_create_system_user_pk, + on_delete=django.db.models.deletion.CASCADE, + to=settings.AUTH_USER_MODEL, + ), + ), + ], + options={ + "verbose_name": "API Outbound Webhook", + "app_label": "api", + }, + ), + migrations.AddConstraint( + model_name="outboundwebhook", + constraint=models.UniqueConstraint(fields=["ref", "endpoint"], name="prevent_duplicate_hooks_api_outboundwebhook"), + ), + ], + ), + ] diff --git a/archivebox/search/backends/__init__.py b/archivebox/api/migrations/__init__.py similarity index 100% rename from archivebox/search/backends/__init__.py rename to archivebox/api/migrations/__init__.py diff --git a/archivebox/api/models.py b/archivebox/api/models.py new file mode 100755 index 0000000000..2fbfabe679 --- /dev/null +++ 
class APIToken(models.Model):
    """A secret API key owned by a user, optionally limited by an expiry time.

    The ``token`` value defaults to ``generate_secret_token()`` and must be
    unique; ``expires`` left empty means the key does not expire (see
    ``is_valid``).
    """

    # NOTE: field order is kept as declared; Django derives column/form order from it.
    id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
    created_by = models.ForeignKey(
        settings.AUTH_USER_MODEL,
        on_delete=models.CASCADE,
        default=get_or_create_system_user_pk,
        null=False,
    )
    created_at = models.DateTimeField(default=timezone.now, db_index=True)
    modified_at = models.DateTimeField(auto_now=True)
    token = models.CharField(max_length=32, default=generate_secret_token, unique=True)
    expires = models.DateTimeField(null=True, blank=True)

    class Meta(TypedModelMeta):
        app_label = "api"
        verbose_name = "API Key"
        verbose_name_plural = "API Keys"

    def __str__(self) -> str:
        # NOTE(review): this returns the full secret, so anything that renders
        # str(token) will expose it -- confirm that is intended; `token_redacted`
        # exists for display purposes.
        return self.token

    @property
    def token_redacted(self):
        """Return a masked form of the token showing only its last four characters."""
        visible_tail = self.token[-4:]
        return f"************{visible_tail}"

    def is_valid(self, for_date=None):
        """Return whether the token is unexpired at ``for_date`` (default: now).

        A token without an ``expires`` value is always valid.
        """
        if not self.expires:
            return True
        reference_time = for_date or timezone.now()
        return self.expires >= reference_time
file mode 100644 index 0000000000..5324e2173d --- /dev/null +++ b/archivebox/api/urls.py @@ -0,0 +1,16 @@ +__package__ = "archivebox.api" + +from django.urls import path +from django.views.generic.base import RedirectView + +from .v1_api import urls as v1_api_urls + +urlpatterns = [ + path("", RedirectView.as_view(url="/api/v1/docs")), + path("v1/", RedirectView.as_view(url="/api/v1/docs")), + path("v1/", v1_api_urls), + path("v1", RedirectView.as_view(url="/api/v1/docs")), + # ... v2 can be added here ... + # path("v2/", v2_api_urls), + # path("v2", RedirectView.as_view(url='/api/v2/docs')), +] diff --git a/archivebox/api/v1_api.py b/archivebox/api/v1_api.py new file mode 100644 index 0000000000..e45b8125db --- /dev/null +++ b/archivebox/api/v1_api.py @@ -0,0 +1,135 @@ +__package__ = "archivebox.api" + + +from io import StringIO +from traceback import format_exception +from contextlib import redirect_stdout, redirect_stderr + +from django.http import HttpRequest, HttpResponse +from django.core.exceptions import ObjectDoesNotExist, EmptyResultSet, PermissionDenied +from django.contrib.auth.models import User + +from ninja import NinjaAPI, Swagger + +# TODO: explore adding https://eadwincode.github.io/django-ninja-extra/ + +from archivebox.config import VERSION +from archivebox.config.version import get_COMMIT_HASH + +from archivebox.api.auth import API_AUTH_METHODS +from archivebox.api.models import APIToken + + +COMMIT_HASH = get_COMMIT_HASH() or "unknown" + +html_description = f""" +

Welcome to your ArchiveBox server's REST API [v1 ALPHA] homepage!

+
+WARNING: This API is still in an early development stage and may change! +
+ +Served by ArchiveBox v{VERSION} ({COMMIT_HASH[:8]}), API powered by django-ninja. +""" + + +def register_urls(api: NinjaAPI) -> NinjaAPI: + api.add_router("/auth/", "archivebox.api.v1_auth.router") + api.add_router("/core/", "archivebox.api.v1_core.router") + api.add_router("/crawls/", "archivebox.api.v1_crawls.router") + api.add_router("/cli/", "archivebox.api.v1_cli.router") + api.add_router("/machine/", "archivebox.api.v1_machine.router") + return api + + +class NinjaAPIWithIOCapture(NinjaAPI): + def create_temporal_response(self, request: HttpRequest) -> HttpResponse: + stdout, stderr = StringIO(), StringIO() + + with redirect_stderr(stderr): + with redirect_stdout(stdout): + setattr(request, "stdout", stdout) + setattr(request, "stderr", stderr) + + response = super().create_temporal_response(request) + + # Disable caching of API responses entirely + response["Cache-Control"] = "no-store" + + # Add debug stdout and stderr headers to response + response["X-ArchiveBox-Stdout"] = stdout.getvalue().replace("\n", "\\n")[:200] + response["X-ArchiveBox-Stderr"] = stderr.getvalue().replace("\n", "\\n")[:200] + # response['X-ArchiveBox-View'] = self.get_openapi_operation_id(request) or 'Unknown' + + # Add Auth Headers to response + api_token_attr = getattr(request, "_api_token", None) + api_token = api_token_attr if isinstance(api_token_attr, APIToken) else None + token_expiry = api_token.expires.isoformat() if api_token and api_token.expires else "Never" + + response["X-ArchiveBox-Auth-Method"] = str(getattr(request, "_api_auth_method", "None")) + response["X-ArchiveBox-Auth-Expires"] = token_expiry + response["X-ArchiveBox-Auth-Token-Id"] = str(api_token.id) if api_token else "None" + response["X-ArchiveBox-Auth-User-Id"] = str(request.user.pk) if getattr(request.user, "pk", None) else "None" + response["X-ArchiveBox-Auth-User-Username"] = request.user.username if isinstance(request.user, User) else "None" + + # import ipdb; ipdb.set_trace() + # print('RESPONDING 
NOW', response) + + return response + + +api = NinjaAPIWithIOCapture( + title="ArchiveBox API", + description=html_description, + version=VERSION, + auth=API_AUTH_METHODS, + urls_namespace="api-1", + docs=Swagger(settings={"persistAuthorization": True}), + # docs_decorator=login_required, + # renderer=ORJSONRenderer(), +) +api = register_urls(api) +urls = api.urls + + +@api.exception_handler(Exception) +def generic_exception_handler(request, err): + status = 503 + if isinstance(err, (ObjectDoesNotExist, EmptyResultSet, PermissionDenied)): + status = 404 + + print("".join(format_exception(err))) + + return api.create_response( + request, + { + "succeeded": False, + "message": f"{err.__class__.__name__}: {err}", + "errors": [ + "".join(format_exception(err)), + # or send simpler parent-only traceback: + # *([str(err.__context__)] if getattr(err, '__context__', None) else []), + ], + }, + status=status, + ) + + +# import orjson +# from ninja.renderers import BaseRenderer +# class ORJSONRenderer(BaseRenderer): +# media_type = "application/json" +# def render(self, request, data, *, response_status): +# return { +# "success": True, +# "errors": [], +# "result": data, +# "stdout": ansi_to_html(stdout.getvalue().strip()), +# "stderr": ansi_to_html(stderr.getvalue().strip()), +# } +# return orjson.dumps(data) diff --git a/archivebox/api/v1_auth.py b/archivebox/api/v1_auth.py new file mode 100644 index 0000000000..e8c61e173f --- /dev/null +++ b/archivebox/api/v1_auth.py @@ -0,0 +1,65 @@ +__package__ = "archivebox.api" + +from django.http import HttpRequest + +from ninja import Router, Schema + +from archivebox.api.auth import auth_using_token, auth_using_password, get_or_create_api_token + + +router = Router(tags=["Authentication"], auth=None) + + +class PasswordAuthSchema(Schema): + """Schema for a /get_api_token request""" + + username: str | None = None + password: str | None = None + + +@router.post( + "/get_api_token", + auth=None, + summary="Generate an API token for 
a given username & password (or currently logged-in user)", +) # auth=None because they are not authed yet +def get_api_token(request: HttpRequest, auth_data: PasswordAuthSchema): + user = auth_using_password( + username=auth_data.username, + password=auth_data.password, + request=request, + ) + + if user and user.is_superuser: + api_token = get_or_create_api_token(user) + assert api_token is not None, "Failed to create API token" + return { + "success": True, + "user_id": str(user.pk), + "username": user.username, + "token": api_token.token, + "expires": api_token.expires.isoformat() if api_token.expires else None, + } + + return {"success": False, "errors": ["Invalid credentials"]} + + +class TokenAuthSchema(Schema): + """Schema for a /check_api_token request""" + + token: str + + +@router.post( + "/check_api_token", + auth=None, + summary="Validate an API token to make sure its valid and non-expired", +) # auth=None because they are not authed yet +def check_api_token(request: HttpRequest, token_data: TokenAuthSchema): + user = auth_using_token( + token=token_data.token, + request=request, + ) + if user: + return {"success": True, "user_id": str(user.pk)} + + return {"success": False, "user_id": None} diff --git a/archivebox/api/v1_cli.py b/archivebox/api/v1_cli.py new file mode 100644 index 0000000000..2c317ad4d1 --- /dev/null +++ b/archivebox/api/v1_cli.py @@ -0,0 +1,284 @@ +__package__ = "archivebox.api" + +import json +from io import StringIO +from typing import Any +from enum import Enum + +from django.http import HttpRequest + +from ninja import Router, Schema + +from archivebox.misc.util import ansi_to_html +from archivebox.config.common import ARCHIVING_CONFIG + + +# from .auth import API_AUTH_METHODS + +# router for API that exposes archivebox cli subcommands as REST endpoints +router = Router(tags=["ArchiveBox CLI Sub-Commands"]) + + +# Schemas + +JSONType = list[Any] | dict[str, Any] | bool | int | str | None + + +class 
CLICommandResponseSchema(Schema): + success: bool + errors: list[str] + result: JSONType + result_format: str = "str" + stdout: str + stderr: str + + +class FilterTypeChoices(str, Enum): + exact = "exact" + substring = "substring" + regex = "regex" + domain = "domain" + tag = "tag" + timestamp = "timestamp" + + +class StatusChoices(str, Enum): + indexed = "indexed" + archived = "archived" + unarchived = "unarchived" + present = "present" + valid = "valid" + invalid = "invalid" + duplicate = "duplicate" + orphaned = "orphaned" + corrupted = "corrupted" + unrecognized = "unrecognized" + + +class AddCommandSchema(Schema): + urls: list[str] + tag: str = "" + depth: int = 0 + parser: str = "auto" + plugins: str = "" + update: bool = not ARCHIVING_CONFIG.ONLY_NEW # Default to the opposite of ARCHIVING_CONFIG.ONLY_NEW + overwrite: bool = False + index_only: bool = False + + +class UpdateCommandSchema(Schema): + resume: str | None = None + after: float | None = 0 + before: float | None = 999999999999999 + filter_type: str | None = FilterTypeChoices.substring + filter_patterns: list[str] | None = ["https://example.com"] + batch_size: int = 100 + continuous: bool = False + + +class ScheduleCommandSchema(Schema): + import_path: str | None = None + add: bool = False + show: bool = False + foreground: bool = False + run_all: bool = False + quiet: bool = False + every: str | None = None + tag: str = "" + depth: int = 0 + overwrite: bool = False + update: bool = not ARCHIVING_CONFIG.ONLY_NEW + clear: bool = False + + +class ListCommandSchema(Schema): + filter_patterns: list[str] | None = ["https://example.com"] + filter_type: str = FilterTypeChoices.substring + status: StatusChoices = StatusChoices.indexed + after: float | None = 0 + before: float | None = 999999999999999 + sort: str = "bookmarked_at" + as_json: bool = True + as_html: bool = False + as_csv: str | None = "timestamp,url" + with_headers: bool = False + + +class RemoveCommandSchema(Schema): + delete: bool = True + 
after: float | None = 0 + before: float | None = 999999999999999 + filter_type: str = FilterTypeChoices.exact + filter_patterns: list[str] | None = ["https://example.com"] + + +@router.post("/add", response=CLICommandResponseSchema, summary="archivebox add [args] [urls]") +def cli_add(request: HttpRequest, args: AddCommandSchema): + from archivebox.cli.archivebox_add import add + + crawl, snapshots = add( + urls=args.urls, + tag=args.tag, + depth=args.depth, + update=args.update, + index_only=args.index_only, + overwrite=args.overwrite, + plugins=args.plugins, + parser=args.parser, + bg=True, # Always run in background for API calls + created_by_id=request.user.pk, + ) + + snapshot_ids = [str(snapshot_id) for snapshot_id in snapshots.values_list("id", flat=True)] + result_payload = { + "crawl_id": str(crawl.id), + "num_snapshots": len(snapshot_ids), + "snapshot_ids": snapshot_ids, + "queued_urls": args.urls, + } + stdout = getattr(request, "stdout", None) + stderr = getattr(request, "stderr", None) + + return { + "success": True, + "errors": [], + "result": result_payload, + "result_format": "json", + "stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else "", + "stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else "", + } + + +@router.post("/update", response=CLICommandResponseSchema, summary="archivebox update [args] [filter_patterns]") +def cli_update(request: HttpRequest, args: UpdateCommandSchema): + from archivebox.cli.archivebox_update import update + + result = update( + filter_patterns=args.filter_patterns or [], + filter_type=args.filter_type or FilterTypeChoices.substring, + after=args.after, + before=args.before, + resume=args.resume, + batch_size=args.batch_size, + continuous=args.continuous, + ) + stdout = getattr(request, "stdout", None) + stderr = getattr(request, "stderr", None) + return { + "success": True, + "errors": [], + "result": result, + "stdout": 
ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else "", + "stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else "", + } + + +@router.post("/schedule", response=CLICommandResponseSchema, summary="archivebox schedule [args] [import_path]") +def cli_schedule(request: HttpRequest, args: ScheduleCommandSchema): + from archivebox.cli.archivebox_schedule import schedule + + result = schedule( + import_path=args.import_path, + add=args.add, + show=args.show, + foreground=args.foreground, + run_all=args.run_all, + quiet=args.quiet, + clear=args.clear, + every=args.every, + tag=args.tag, + depth=args.depth, + overwrite=args.overwrite, + update=args.update, + ) + + stdout = getattr(request, "stdout", None) + stderr = getattr(request, "stderr", None) + return { + "success": True, + "errors": [], + "result": result, + "result_format": "json", + "stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else "", + "stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else "", + } + + +@router.post("/search", response=CLICommandResponseSchema, summary="archivebox search [args] [filter_patterns]") +def cli_search(request: HttpRequest, args: ListCommandSchema): + from archivebox.cli.archivebox_search import search + + result = search( + filter_patterns=args.filter_patterns, + filter_type=args.filter_type, + status=args.status, + after=args.after, + before=args.before, + sort=args.sort, + csv=args.as_csv, + json=args.as_json, + html=args.as_html, + with_headers=args.with_headers, + ) + + result_format = "txt" + if args.as_json: + result_format = "json" + result = json.loads(result) + elif args.as_html: + result_format = "html" + elif args.as_csv: + result_format = "csv" + + stdout = getattr(request, "stdout", None) + stderr = getattr(request, "stderr", None) + return { + "success": True, + "errors": [], + "result": result, + "result_format": result_format, + "stdout": 
@router.post("/remove", response=CLICommandResponseSchema, summary="archivebox remove [args] [filter_patterns]")
def cli_remove(request: HttpRequest, args: RemoveCommandSchema):
    """Remove the snapshots selected by ``args`` and report what was removed.

    The matching snapshots are resolved first so their ids can be returned,
    then ``remove()`` is invoked with ``yes=True`` because an API call cannot
    answer an interactive confirmation prompt.

    Returns a dict shaped like ``CLICommandResponseSchema`` whose ``result``
    holds the removed ids, their count, and the number of snapshots left.
    """
    from archivebox.cli.archivebox_remove import remove
    from archivebox.cli.archivebox_search import get_snapshots
    from archivebox.core.models import Snapshot

    def captured(stream_name: str) -> str:
        # The capture buffers are attached to the request elsewhere; fall back
        # to an empty string when the attribute is missing or not a StringIO.
        stream = getattr(request, stream_name, None)
        if not isinstance(stream, StringIO):
            return ""
        return ansi_to_html(stream.getvalue().strip())

    # Same selection criteria are used both to look up and to remove snapshots.
    selection = {
        "filter_patterns": args.filter_patterns or [],
        "filter_type": args.filter_type,
        "after": args.after,
        "before": args.before,
    }
    doomed = get_snapshots(**selection)
    # Collect ids before removal, while the rows can still be queried.
    removed_ids = list(map(str, doomed.values_list("id", flat=True)))

    remove(
        yes=True,  # no way to interactively ask for confirmation via API, so we force yes
        delete=args.delete,
        snapshots=doomed,
        **selection,
    )

    return {
        "success": True,
        "errors": [],
        "result": {
            "removed_count": len(removed_ids),
            "removed_snapshot_ids": removed_ids,
            "remaining_snapshots": Snapshot.objects.count(),
        },
        "result_format": "json",
        "stdout": captured("stdout"),
        "stderr": captured("stderr"),
    }
class CustomPagination(PaginationBase):
    """Limit/offset paginator that also accepts a zero-based ``page`` number.

    Compared with a plain limit/offset paginator, the response additionally
    reports the total item/page counts and the page the returned window
    starts on.
    """

    class Input(PaginationBase.Input):
        limit: int = 200  # requested page size; clamped to 1..500 when applied
        offset: int = 0  # index of the first item; when non-zero it wins over `page`
        page: int = 0  # zero-based page number, only used when `offset` is 0

    class Output(PaginationBase.Output):
        count: int
        total_items: int
        total_pages: int
        page: int
        limit: int
        offset: int
        num_items: int
        items: list[Any]

    def paginate_queryset(self, queryset, pagination: Input, request: HttpRequest, **params):
        """Return one window of ``queryset`` plus paging metadata.

        Args:
            queryset: object supporting ``.count()`` and slicing.
            pagination: validated ``Input`` (limit / offset / page).
            request: the current request (unused here).
            **params: extra keyword arguments passed by the framework (unused).

        Returns:
            dict with the keys declared on ``Output``. ``count`` and
            ``total_items`` carry the same value.
        """
        # Clamp the page size to 1..500: a limit of 0 would divide by zero below,
        # and a negative limit would yield a negative slice bound.
        limit = max(1, min(pagination.limit, 500))
        # An explicit non-zero offset takes precedence over `page`; never go below 0
        # so the slice bounds stay non-negative.
        offset = max(0, pagination.offset or (pagination.page * limit))
        total = queryset.count()
        total_pages = math.ceil(total / limit)
        # Zero-based index of the page containing the first returned item; this
        # round-trips exactly with `offset = page * limit` for every page number.
        current_page = offset // limit
        items = queryset[offset : offset + limit]
        return {
            "count": total,
            "total_items": total,
            "total_pages": total_pages,
            "page": current_page,
            "limit": limit,
            "offset": offset,
            "num_items": len(items),
            "items": items,
        }
resolve_snapshot_id(obj): + return obj.snapshot_id + + @staticmethod + def resolve_snapshot_tags(obj): + return sorted(tag.name for tag in obj.snapshot.tags.all()) + + +class ArchiveResultFilterSchema(FilterSchema): + id: Annotated[str | None, FilterLookup(["id__startswith", "snapshot__id__startswith", "snapshot__timestamp__startswith"])] = None + search: Annotated[ + str | None, + FilterLookup( + [ + "snapshot__url__icontains", + "snapshot__title__icontains", + "snapshot__tags__name__icontains", + "plugin", + "output_str__icontains", + "id__startswith", + "snapshot__id__startswith", + "snapshot__timestamp__startswith", + ], + ), + ] = None + snapshot_id: Annotated[str | None, FilterLookup(["snapshot__id__startswith", "snapshot__timestamp__startswith"])] = None + snapshot_url: Annotated[str | None, FilterLookup("snapshot__url__icontains")] = None + snapshot_tag: Annotated[str | None, FilterLookup("snapshot__tags__name__icontains")] = None + status: Annotated[str | None, FilterLookup("status")] = None + output_str: Annotated[str | None, FilterLookup("output_str__icontains")] = None + plugin: Annotated[str | None, FilterLookup("plugin__icontains")] = None + hook_name: Annotated[str | None, FilterLookup("hook_name__icontains")] = None + process_id: Annotated[str | None, FilterLookup("process__id__startswith")] = None + cmd: Annotated[str | None, FilterLookup("cmd__0__icontains")] = None + pwd: Annotated[str | None, FilterLookup("pwd__icontains")] = None + cmd_version: Annotated[str | None, FilterLookup("cmd_version")] = None + created_at: Annotated[datetime | None, FilterLookup("created_at")] = None + created_at__gte: Annotated[datetime | None, FilterLookup("created_at__gte")] = None + created_at__lt: Annotated[datetime | None, FilterLookup("created_at__lt")] = None + + +@router.get("/archiveresults", response=list[ArchiveResultSchema], url_name="get_archiveresult") +@paginate(CustomPagination) +def get_archiveresults(request: HttpRequest, filters: 
@router.get("/archiveresult/{archiveresult_id}", response=ArchiveResultSchema, url_name="get_archiveresult")
def get_archiveresult(request: HttpRequest, archiveresult_id: str):
    """Get a specific ArchiveResult by id.

    ``archiveresult_id`` may be a full id or an abbreviated one. A prefix match
    is tried first; only when nothing starts with the given value does the
    lookup fall back to the looser case-insensitive substring match. This keeps
    every id that previously resolved working, while an abbreviated id that is
    a unique prefix no longer fails just because the same characters also
    appear in the middle of another id.

    Raises:
        ArchiveResult.DoesNotExist: no id contains the given value.
        ArchiveResult.MultipleObjectsReturned: the value is still ambiguous.
    """
    try:
        return ArchiveResult.objects.get(id__startswith=archiveresult_id)
    except ArchiveResult.DoesNotExist:
        # Backward-compatible fallback: accept any (case-insensitive) fragment of the id.
        return ArchiveResult.objects.get(id__icontains=archiveresult_id)
+ +class SnapshotCreateSchema(Schema): + url: str + crawl_id: str | None = None + depth: int = 0 + title: str | None = None + tags: list[str] | None = None + status: str | None = None + + +class SnapshotDeleteResponseSchema(Schema): + success: bool + snapshot_id: str + crawl_id: str + deleted_count: int + + +def normalize_tag_list(tags: list[str] | None = None) -> list[str]: + return [tag.strip() for tag in (tags or []) if tag and tag.strip()] + + +class SnapshotFilterSchema(FilterSchema): + id: Annotated[str | None, FilterLookup(["id__icontains", "timestamp__startswith"])] = None + created_by_id: Annotated[str | None, FilterLookup("crawl__created_by_id")] = None + created_by_username: Annotated[str | None, FilterLookup("crawl__created_by__username__icontains")] = None + created_at__gte: Annotated[datetime | None, FilterLookup("created_at__gte")] = None + created_at__lt: Annotated[datetime | None, FilterLookup("created_at__lt")] = None + created_at: Annotated[datetime | None, FilterLookup("created_at")] = None + modified_at: Annotated[datetime | None, FilterLookup("modified_at")] = None + modified_at__gte: Annotated[datetime | None, FilterLookup("modified_at__gte")] = None + modified_at__lt: Annotated[datetime | None, FilterLookup("modified_at__lt")] = None + search: Annotated[ + str | None, + FilterLookup(["url__icontains", "title__icontains", "tags__name__icontains", "id__icontains", "timestamp__startswith"]), + ] = None + url: Annotated[str | None, FilterLookup("url")] = None + tag: Annotated[str | None, FilterLookup("tags__name")] = None + title: Annotated[str | None, FilterLookup("title__icontains")] = None + timestamp: Annotated[str | None, FilterLookup("timestamp__startswith")] = None + bookmarked_at__gte: Annotated[datetime | None, FilterLookup("bookmarked_at__gte")] = None + bookmarked_at__lt: Annotated[datetime | None, FilterLookup("bookmarked_at__lt")] = None + + +@router.get("/snapshots", response=list[SnapshotSchema], url_name="get_snapshots") 
+@paginate(CustomPagination) +def get_snapshots(request: HttpRequest, filters: Query[SnapshotFilterSchema], with_archiveresults: bool = False): + """List all Snapshot entries matching these filters.""" + setattr(request, "with_archiveresults", with_archiveresults) + queryset = Snapshot.objects.annotate(output_size_sum=Coalesce(Sum("archiveresult__output_size"), 0)) + return filters.filter(queryset).distinct() + + +@router.get("/snapshot/{snapshot_id}", response=SnapshotSchema, url_name="get_snapshot") +def get_snapshot(request: HttpRequest, snapshot_id: str, with_archiveresults: bool = True): + """Get a specific Snapshot by id.""" + setattr(request, "with_archiveresults", with_archiveresults) + queryset = Snapshot.objects.annotate(output_size_sum=Coalesce(Sum("archiveresult__output_size"), 0)) + try: + return queryset.get(Q(id__startswith=snapshot_id) | Q(timestamp__startswith=snapshot_id)) + except Snapshot.DoesNotExist: + return queryset.get(Q(id__icontains=snapshot_id)) + + +@router.post("/snapshots", response=SnapshotSchema, url_name="create_snapshot") +def create_snapshot(request: HttpRequest, data: SnapshotCreateSchema): + tags = normalize_tag_list(data.tags) + if data.status is not None and data.status not in Snapshot.StatusChoices.values: + raise HttpError(400, f"Invalid status: {data.status}") + if not data.url.strip(): + raise HttpError(400, "URL is required") + if data.depth not in (0, 1, 2, 3, 4): + raise HttpError(400, "depth must be between 0 and 4") + + if data.crawl_id: + crawl = Crawl.objects.get(id__icontains=data.crawl_id) + crawl_tags = normalize_tag_list(crawl.tags_str.split(",")) + tags = tags or crawl_tags + else: + crawl = Crawl.objects.create( + urls=data.url, + max_depth=max(data.depth, 0), + tags_str=",".join(tags), + status=Crawl.StatusChoices.QUEUED, + retry_at=timezone.now(), + created_by=request.user if isinstance(request.user, User) else None, + ) + + snapshot_defaults = { + "depth": data.depth, + "title": data.title, + "timestamp": 
str(timezone.now().timestamp()), + "status": data.status or Snapshot.StatusChoices.QUEUED, + "retry_at": timezone.now(), + } + snapshot, _ = Snapshot.objects.get_or_create( + url=data.url, + crawl=crawl, + defaults=snapshot_defaults, + ) + + update_fields: list[str] = [] + if data.title is not None and snapshot.title != data.title: + snapshot.title = data.title + update_fields.append("title") + if data.status is not None and snapshot.status != data.status: + if data.status not in Snapshot.StatusChoices.values: + raise HttpError(400, f"Invalid status: {data.status}") + snapshot.status = data.status + update_fields.append("status") + if update_fields: + update_fields.append("modified_at") + snapshot.save(update_fields=update_fields) + + if tags: + snapshot.save_tags(tags) + + try: + snapshot.ensure_crawl_symlink() + except Exception: + pass + + setattr(request, "with_archiveresults", False) + return snapshot + + +@router.patch("/snapshot/{snapshot_id}", response=SnapshotSchema, url_name="patch_snapshot") +def patch_snapshot(request: HttpRequest, snapshot_id: str, data: SnapshotUpdateSchema): + """Update a snapshot (e.g., set status=sealed to cancel queued work).""" + try: + snapshot = Snapshot.objects.get(Q(id__startswith=snapshot_id) | Q(timestamp__startswith=snapshot_id)) + except Snapshot.DoesNotExist: + snapshot = Snapshot.objects.get(Q(id__icontains=snapshot_id)) + + payload = data.dict(exclude_unset=True) + update_fields = ["modified_at"] + tags = payload.pop("tags", None) + + if "status" in payload: + if payload["status"] not in Snapshot.StatusChoices.values: + raise HttpError(400, f"Invalid status: {payload['status']}") + snapshot.status = payload["status"] + if snapshot.status == Snapshot.StatusChoices.SEALED and "retry_at" not in payload: + snapshot.retry_at = None + update_fields.append("status") + + if "retry_at" in payload: + snapshot.retry_at = payload["retry_at"] + update_fields.append("retry_at") + + if tags is not None: + 
snapshot.save_tags(normalize_tag_list(tags)) + + snapshot.save(update_fields=update_fields) + setattr(request, "with_archiveresults", False) + return snapshot + + +@router.delete("/snapshot/{snapshot_id}", response=SnapshotDeleteResponseSchema, url_name="delete_snapshot") +def delete_snapshot(request: HttpRequest, snapshot_id: str): + snapshot = get_snapshot(request, snapshot_id, with_archiveresults=False) + snapshot_id_str = str(snapshot.id) + crawl_id_str = str(snapshot.crawl.pk) + deleted_count, _ = snapshot.delete() + return { + "success": True, + "snapshot_id": snapshot_id_str, + "crawl_id": crawl_id_str, + "deleted_count": deleted_count, + } + + +### Tag ######################################################################### + + +class TagSchema(Schema): + TYPE: str = "core.models.Tag" + id: int + modified_at: datetime + created_at: datetime + created_by_id: str + created_by_username: str + name: str + slug: str + num_snapshots: int + snapshots: list[SnapshotSchema] + + @staticmethod + def resolve_created_by_id(obj): + return str(obj.created_by_id) + + @staticmethod + def resolve_created_by_username(obj): + user_model = get_user_model() + user = user_model.objects.get(id=obj.created_by_id) + username = getattr(user, "username", None) + return username if isinstance(username, str) else str(user) + + @staticmethod + def resolve_num_snapshots(obj, context): + return obj.snapshot_set.all().distinct().count() + + @staticmethod + def resolve_snapshots(obj, context): + if bool(getattr(context["request"], "with_snapshots", False)): + return obj.snapshot_set.all().distinct() + return Snapshot.objects.none() + + +@router.get("/tags", response=list[TagSchema], url_name="get_tags") +@paginate(CustomPagination) +def get_tags(request: HttpRequest): + setattr(request, "with_snapshots", False) + setattr(request, "with_archiveresults", False) + return get_matching_tags() + + +@router.get("/tag/{tag_id}", response=TagSchema, url_name="get_tag") +def get_tag(request: 
HttpRequest, tag_id: str, with_snapshots: bool = True): + setattr(request, "with_snapshots", with_snapshots) + setattr(request, "with_archiveresults", False) + try: + return get_tag_by_ref(tag_id) + except (Tag.DoesNotExist, ValidationError): + raise HttpError(404, "Tag not found") + + +@router.get( + "/any/{id}", + response=Union[SnapshotSchema, ArchiveResultSchema, TagSchema, CrawlSchema], + url_name="get_any", + summary="Get any object by its ID", +) +def get_any(request: HttpRequest, id: str): + """Get any object by its ID (e.g. snapshot, archiveresult, tag, crawl, etc.).""" + setattr(request, "with_snapshots", False) + setattr(request, "with_archiveresults", False) + + for getter in [get_snapshot, get_archiveresult, get_tag]: + try: + response = getter(request, id) + if isinstance(response, Model): + return redirect( + f"/api/v1/{response._meta.app_label}/{response._meta.model_name}/{response.pk}?{request.META['QUERY_STRING']}", + ) + except Exception: + pass + + try: + from archivebox.api.v1_crawls import get_crawl + + response = get_crawl(request, id) + if isinstance(response, Model): + return redirect(f"/api/v1/{response._meta.app_label}/{response._meta.model_name}/{response.pk}?{request.META['QUERY_STRING']}") + except Exception: + pass + + raise HttpError(404, "Object with given ID not found") + + +### Tag Editor API Endpoints ######################################################################### + + +class TagAutocompleteSchema(Schema): + tags: list[dict] + + +class TagCreateSchema(Schema): + name: str + + +class TagCreateResponseSchema(Schema): + success: bool + tag_id: int + tag_name: str + created: bool + + +class TagSearchSnapshotSchema(Schema): + id: str + title: str + url: str + favicon_url: str + admin_url: str + archive_url: str + downloaded_at: str | None = None + + +class TagSearchCardSchema(Schema): + id: int + name: str + slug: str + num_snapshots: int + filter_url: str + edit_url: str + export_urls_url: str + export_jsonl_url: str + 
rename_url: str + delete_url: str + snapshots: list[TagSearchSnapshotSchema] + + +class TagSearchResponseSchema(Schema): + tags: list[TagSearchCardSchema] + sort: str + created_by: str + year: str + has_snapshots: str + + +class TagUpdateSchema(Schema): + name: str + + +class TagUpdateResponseSchema(Schema): + success: bool + tag_id: int + tag_name: str + slug: str + + +class TagDeleteResponseSchema(Schema): + success: bool + tag_id: int + deleted_count: int + + +class TagSnapshotRequestSchema(Schema): + snapshot_id: str + tag_name: str | None = None + tag_id: int | None = None + + +class TagSnapshotResponseSchema(Schema): + success: bool + tag_id: int + tag_name: str + + +@router.get("/tags/search/", response=TagSearchResponseSchema, url_name="search_tags") +def search_tags( + request: HttpRequest, + q: str = "", + sort: str = "created_desc", + created_by: str = "", + year: str = "", + has_snapshots: str = "all", +): + """Return detailed tag cards for admin/live-search UIs.""" + normalized_sort = normalize_tag_sort(sort) + normalized_created_by = normalize_created_by_filter(created_by) + normalized_year = normalize_created_year_filter(year) + normalized_has_snapshots = normalize_has_snapshots_filter(has_snapshots) + return { + "tags": build_tag_cards( + query=q, + request=request, + sort=normalized_sort, + created_by=normalized_created_by, + year=normalized_year, + has_snapshots=normalized_has_snapshots, + ), + "sort": normalized_sort, + "created_by": normalized_created_by, + "year": normalized_year, + "has_snapshots": normalized_has_snapshots, + } + + +def _public_tag_listing_enabled() -> bool: + explicit = getattr(settings, "PUBLIC_SNAPSHOTS_LIST", None) + if explicit is not None: + return bool(explicit) + return bool(getattr(settings, "PUBLIC_INDEX", SERVER_CONFIG.PUBLIC_INDEX)) + + +def _request_has_tag_autocomplete_access(request: HttpRequest) -> bool: + user = getattr(request, "user", None) + if getattr(user, "is_authenticated", False): + return True + + 
token = request.GET.get("api_key") or request.headers.get("X-ArchiveBox-API-Key") + auth_header = request.headers.get("Authorization", "") + if not token and auth_header.lower().startswith("bearer "): + token = auth_header.split(None, 1)[1].strip() + + if token and auth_using_token(token=token, request=request): + return True + + return _public_tag_listing_enabled() + + +@router.get("/tags/autocomplete/", response=TagAutocompleteSchema, url_name="tags_autocomplete", auth=None) +def tags_autocomplete(request: HttpRequest, q: str = ""): + """Return tags matching the query for autocomplete.""" + if not _request_has_tag_autocomplete_access(request): + raise HttpError(401, "Authentication required") + + tags = get_matching_tags(q)[: 50 if not q else 20] + + return { + "tags": [{"id": tag.pk, "name": tag.name, "slug": tag.slug, "num_snapshots": getattr(tag, "num_snapshots", 0)} for tag in tags], + } + + +@router.post("/tags/create/", response=TagCreateResponseSchema, url_name="tags_create") +def tags_create(request: HttpRequest, data: TagCreateSchema): + """Create a new tag or return existing one.""" + try: + tag, created = get_or_create_tag( + data.name, + created_by=request.user if request.user.is_authenticated else None, + ) + except ValueError as err: + raise HttpError(400, str(err)) from err + + return { + "success": True, + "tag_id": tag.pk, + "tag_name": tag.name, + "created": created, + } + + +@router.post("/tag/{tag_id}/rename", response=TagUpdateResponseSchema, url_name="rename_tag") +def rename_tag(request: HttpRequest, tag_id: int, data: TagUpdateSchema): + try: + tag = rename_tag_record(get_tag_by_ref(tag_id), data.name) + except Tag.DoesNotExist as err: + raise HttpError(404, "Tag not found") from err + except ValueError as err: + raise HttpError(400, str(err)) from err + + return { + "success": True, + "tag_id": tag.pk, + "tag_name": tag.name, + "slug": tag.slug, + } + + +@router.delete("/tag/{tag_id}", response=TagDeleteResponseSchema, 
url_name="delete_tag") +def delete_tag(request: HttpRequest, tag_id: int): + try: + tag = get_tag_by_ref(tag_id) + except Tag.DoesNotExist as err: + raise HttpError(404, "Tag not found") from err + + deleted_count, _ = delete_tag_record(tag) + return { + "success": True, + "tag_id": int(tag_id), + "deleted_count": deleted_count, + } + + +@router.get("/tag/{tag_id}/urls.txt", url_name="tag_urls_export") +def tag_urls_export(request: HttpRequest, tag_id: int): + try: + tag = get_tag_by_ref(tag_id) + except Tag.DoesNotExist as err: + raise HttpError(404, "Tag not found") from err + + response = HttpResponse(export_tag_urls(tag), content_type="text/plain; charset=utf-8") + response["Content-Disposition"] = f'attachment; filename="tag-{tag.slug}-urls.txt"' + return response + + +@router.get("/tag/{tag_id}/snapshots.jsonl", url_name="tag_snapshots_export") +def tag_snapshots_export(request: HttpRequest, tag_id: int): + try: + tag = get_tag_by_ref(tag_id) + except Tag.DoesNotExist as err: + raise HttpError(404, "Tag not found") from err + + response = HttpResponse(export_tag_snapshots_jsonl(tag), content_type="application/x-ndjson; charset=utf-8") + response["Content-Disposition"] = f'attachment; filename="tag-{tag.slug}-snapshots.jsonl"' + return response + + +@router.post("/tags/add-to-snapshot/", response=TagSnapshotResponseSchema, url_name="tags_add_to_snapshot") +def tags_add_to_snapshot(request: HttpRequest, data: TagSnapshotRequestSchema): + """Add a tag to a snapshot. 
Creates the tag if it doesn't exist.""" + # Get the snapshot + try: + snapshot = Snapshot.objects.get( + Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id), + ) + except Snapshot.DoesNotExist: + raise HttpError(404, "Snapshot not found") + except Snapshot.MultipleObjectsReturned: + snapshot = Snapshot.objects.filter( + Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id), + ).first() + if snapshot is None: + raise HttpError(404, "Snapshot not found") + + # Get or create the tag + if data.tag_name: + try: + tag, _ = get_or_create_tag( + data.tag_name, + created_by=request.user if request.user.is_authenticated else None, + ) + except ValueError as err: + raise HttpError(400, str(err)) from err + elif data.tag_id: + try: + tag = get_tag_by_ref(data.tag_id) + except Tag.DoesNotExist: + raise HttpError(404, "Tag not found") + else: + raise HttpError(400, "Either tag_name or tag_id is required") + + # Add the tag to the snapshot + snapshot.tags.add(tag.pk) + + return { + "success": True, + "tag_id": tag.pk, + "tag_name": tag.name, + } + + +@router.post("/tags/remove-from-snapshot/", response=TagSnapshotResponseSchema, url_name="tags_remove_from_snapshot") +def tags_remove_from_snapshot(request: HttpRequest, data: TagSnapshotRequestSchema): + """Remove a tag from a snapshot.""" + # Get the snapshot + try: + snapshot = Snapshot.objects.get( + Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id), + ) + except Snapshot.DoesNotExist: + raise HttpError(404, "Snapshot not found") + except Snapshot.MultipleObjectsReturned: + snapshot = Snapshot.objects.filter( + Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id), + ).first() + if snapshot is None: + raise HttpError(404, "Snapshot not found") + + # Get the tag + if data.tag_id: + try: + tag = Tag.objects.get(pk=data.tag_id) + except Tag.DoesNotExist: + raise HttpError(404, "Tag not found") + elif data.tag_name: + try: + tag 
= Tag.objects.get(name__iexact=data.tag_name.strip()) + except Tag.DoesNotExist: + raise HttpError(404, "Tag not found") + else: + raise HttpError(400, "Either tag_name or tag_id is required") + + # Remove the tag from the snapshot + snapshot.tags.remove(tag.pk) + + return { + "success": True, + "tag_id": tag.pk, + "tag_name": tag.name, + } diff --git a/archivebox/api/v1_crawls.py b/archivebox/api/v1_crawls.py new file mode 100644 index 0000000000..a925ff1815 --- /dev/null +++ b/archivebox/api/v1_crawls.py @@ -0,0 +1,201 @@ +__package__ = "archivebox.api" + +from uuid import UUID +from datetime import datetime +from django.http import HttpRequest +from django.utils import timezone + +from django.contrib.auth import get_user_model +from django.contrib.auth.models import User + +from ninja import Router, Schema +from ninja.errors import HttpError + +from archivebox.core.models import Snapshot +from archivebox.crawls.models import Crawl + +from .auth import API_AUTH_METHODS + +router = Router(tags=["Crawl Models"], auth=API_AUTH_METHODS) + + +class CrawlSchema(Schema): + TYPE: str = "crawls.models.Crawl" + + id: UUID + + modified_at: datetime + created_at: datetime + created_by_id: str + created_by_username: str + + status: str + retry_at: datetime | None + + urls: str + max_depth: int + max_urls: int + max_size: int + tags_str: str + config: dict + + # snapshots: List[SnapshotSchema] + + @staticmethod + def resolve_created_by_id(obj): + return str(obj.created_by_id) + + @staticmethod + def resolve_created_by_username(obj): + user_model = get_user_model() + user = user_model.objects.get(id=obj.created_by_id) + username = getattr(user, "username", None) + return username if isinstance(username, str) else str(user) + + @staticmethod + def resolve_snapshots(obj, context): + if bool(getattr(context["request"], "with_snapshots", False)): + return obj.snapshot_set.all().distinct() + return Snapshot.objects.none() + + +class CrawlUpdateSchema(Schema): + status: str | None = 
None + retry_at: datetime | None = None + tags: list[str] | None = None + tags_str: str | None = None + + +class CrawlCreateSchema(Schema): + urls: list[str] + max_depth: int = 0 + max_urls: int = 0 + max_size: int = 0 + tags: list[str] | None = None + tags_str: str = "" + label: str = "" + notes: str = "" + config: dict = {} + + +class CrawlDeleteResponseSchema(Schema): + success: bool + crawl_id: str + deleted_count: int + deleted_snapshots: int + + +def normalize_tag_list(tags: list[str] | None = None, tags_str: str = "") -> list[str]: + if tags is not None: + return [tag.strip() for tag in tags if tag and tag.strip()] + return [tag.strip() for tag in tags_str.split(",") if tag.strip()] + + +@router.get("/crawls", response=list[CrawlSchema], url_name="get_crawls") +def get_crawls(request: HttpRequest): + return Crawl.objects.all().distinct() + + +@router.post("/crawls", response=CrawlSchema, url_name="create_crawl") +def create_crawl(request: HttpRequest, data: CrawlCreateSchema): + urls = [url.strip() for url in data.urls if url and url.strip()] + if not urls: + raise HttpError(400, "At least one URL is required") + if data.max_depth not in (0, 1, 2, 3, 4): + raise HttpError(400, "max_depth must be between 0 and 4") + if data.max_urls < 0: + raise HttpError(400, "max_urls must be >= 0") + if data.max_size < 0: + raise HttpError(400, "max_size must be >= 0") + + tags = normalize_tag_list(data.tags, data.tags_str) + crawl = Crawl.objects.create( + urls="\n".join(urls), + max_depth=data.max_depth, + max_urls=data.max_urls, + max_size=data.max_size, + tags_str=",".join(tags), + label=data.label, + notes=data.notes, + config=data.config, + status=Crawl.StatusChoices.QUEUED, + retry_at=timezone.now(), + created_by=request.user if isinstance(request.user, User) else None, + ) + crawl.create_snapshots_from_urls() + return crawl + + +@router.get("/crawl/{crawl_id}", response=CrawlSchema | str, url_name="get_crawl") +def get_crawl(request: HttpRequest, crawl_id: str, 
as_rss: bool = False, with_snapshots: bool = False, with_archiveresults: bool = False): + """Get a specific Crawl by id.""" + setattr(request, "with_snapshots", with_snapshots) + setattr(request, "with_archiveresults", with_archiveresults) + crawl = Crawl.objects.get(id__icontains=crawl_id) + + if crawl and as_rss: + # return snapshots as XML rss feed + urls = [ + {"url": snapshot.url, "title": snapshot.title, "bookmarked_at": snapshot.bookmarked_at, "tags": snapshot.tags_str} + for snapshot in crawl.snapshot_set.all() + ] + xml = '' + for url in urls: + xml += f"{url['url']}{url['title']}{url['bookmarked_at']}{url['tags']}" + xml += "" + return xml + + return crawl + + +@router.patch("/crawl/{crawl_id}", response=CrawlSchema, url_name="patch_crawl") +def patch_crawl(request: HttpRequest, crawl_id: str, data: CrawlUpdateSchema): + """Update a crawl (e.g., set status=sealed to cancel queued work).""" + crawl = Crawl.objects.get(id__icontains=crawl_id) + payload = data.dict(exclude_unset=True) + update_fields = ["modified_at"] + + tags = payload.pop("tags", None) + tags_str = payload.pop("tags_str", None) + if tags is not None or tags_str is not None: + crawl.tags_str = ",".join(normalize_tag_list(tags, tags_str or "")) + update_fields.append("tags_str") + + if "status" in payload: + if payload["status"] not in Crawl.StatusChoices.values: + raise HttpError(400, f"Invalid status: {payload['status']}") + crawl.status = payload["status"] + if crawl.status == Crawl.StatusChoices.SEALED and "retry_at" not in payload: + crawl.retry_at = None + update_fields.append("status") + + if "retry_at" in payload: + crawl.retry_at = payload["retry_at"] + update_fields.append("retry_at") + + crawl.save(update_fields=update_fields) + + if payload.get("status") == Crawl.StatusChoices.SEALED: + Snapshot.objects.filter( + crawl=crawl, + status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED], + ).update( + status=Snapshot.StatusChoices.SEALED, + retry_at=None, + 
modified_at=timezone.now(), + ) + return crawl + + +@router.delete("/crawl/{crawl_id}", response=CrawlDeleteResponseSchema, url_name="delete_crawl") +def delete_crawl(request: HttpRequest, crawl_id: str): + crawl = Crawl.objects.get(id__icontains=crawl_id) + crawl_id_str = str(crawl.id) + snapshot_count = crawl.snapshot_set.count() + deleted_count, _ = crawl.delete() + return { + "success": True, + "crawl_id": crawl_id_str, + "deleted_count": deleted_count, + "deleted_snapshots": snapshot_count, + } diff --git a/archivebox/api/v1_machine.py b/archivebox/api/v1_machine.py new file mode 100644 index 0000000000..e18dbe48fe --- /dev/null +++ b/archivebox/api/v1_machine.py @@ -0,0 +1,161 @@ +__package__ = "archivebox.api" + +from uuid import UUID +from typing import Annotated +from datetime import datetime + +from django.http import HttpRequest + +from ninja import FilterLookup, FilterSchema, Query, Router, Schema +from ninja.pagination import paginate + +from archivebox.api.v1_core import CustomPagination + + +router = Router(tags=["Machine and Dependencies"]) + + +# ============================================================================ +# Machine Schemas +# ============================================================================ + + +class MachineSchema(Schema): + """Schema for Machine model.""" + + TYPE: str = "machine.Machine" + id: UUID + created_at: datetime + modified_at: datetime + guid: str + hostname: str + hw_in_docker: bool + hw_in_vm: bool + hw_manufacturer: str + hw_product: str + hw_uuid: str + os_arch: str + os_family: str + os_platform: str + os_release: str + os_kernel: str + stats: dict + num_uses_succeeded: int + num_uses_failed: int + + +class MachineFilterSchema(FilterSchema): + id: Annotated[str | None, FilterLookup("id__startswith")] = None + hostname: Annotated[str | None, FilterLookup("hostname__icontains")] = None + os_platform: Annotated[str | None, FilterLookup("os_platform__icontains")] = None + os_arch: Annotated[str | None, 
FilterLookup("os_arch")] = None + hw_in_docker: Annotated[bool | None, FilterLookup("hw_in_docker")] = None + hw_in_vm: Annotated[bool | None, FilterLookup("hw_in_vm")] = None + bin_providers: Annotated[str | None, FilterLookup("bin_providers__icontains")] = None + + +# ============================================================================ +# Binary Schemas +# ============================================================================ + + +class BinarySchema(Schema): + """Schema for Binary model.""" + + TYPE: str = "machine.Binary" + id: UUID + created_at: datetime + modified_at: datetime + machine_id: UUID + machine_hostname: str + name: str + binproviders: str + binprovider: str + abspath: str + version: str + sha256: str + status: str + is_valid: bool + num_uses_succeeded: int + num_uses_failed: int + + @staticmethod + def resolve_machine_hostname(obj) -> str: + return obj.machine.hostname + + @staticmethod + def resolve_is_valid(obj) -> bool: + return obj.is_valid + + +class BinaryFilterSchema(FilterSchema): + id: Annotated[str | None, FilterLookup("id__startswith")] = None + name: Annotated[str | None, FilterLookup("name__icontains")] = None + binprovider: Annotated[str | None, FilterLookup("binprovider")] = None + status: Annotated[str | None, FilterLookup("status")] = None + machine_id: Annotated[str | None, FilterLookup("machine_id__startswith")] = None + version: Annotated[str | None, FilterLookup("version__icontains")] = None + + +# ============================================================================ +# Machine Endpoints +# ============================================================================ + + +@router.get("/machines", response=list[MachineSchema], url_name="get_machines") +@paginate(CustomPagination) +def get_machines(request: HttpRequest, filters: Query[MachineFilterSchema]): + """List all machines.""" + from archivebox.machine.models import Machine + + return filters.filter(Machine.objects.all()).distinct() + + 
+@router.get("/machine/current", response=MachineSchema, url_name="get_current_machine") +def get_current_machine(request: HttpRequest): + """Get the current machine.""" + from archivebox.machine.models import Machine + + return Machine.current() + + +@router.get("/machine/{machine_id}", response=MachineSchema, url_name="get_machine") +def get_machine(request: HttpRequest, machine_id: str): + """Get a specific machine by ID.""" + from archivebox.machine.models import Machine + from django.db.models import Q + + return Machine.objects.get(Q(id__startswith=machine_id) | Q(hostname__iexact=machine_id)) + + +# ============================================================================ + + +# ============================================================================ +# Binary Endpoints +# ============================================================================ + + +@router.get("/binaries", response=list[BinarySchema], url_name="get_binaries") +@paginate(CustomPagination) +def get_binaries(request: HttpRequest, filters: Query[BinaryFilterSchema]): + """List all binaries.""" + from archivebox.machine.models import Binary + + return filters.filter(Binary.objects.all().select_related("machine")).distinct() + + +@router.get("/binary/{binary_id}", response=BinarySchema, url_name="get_binary") +def get_binary(request: HttpRequest, binary_id: str): + """Get a specific binary by ID.""" + from archivebox.machine.models import Binary + + return Binary.objects.select_related("machine").get(id__startswith=binary_id) + + +@router.get("/binary/by-name/{name}", response=list[BinarySchema], url_name="get_binaries_by_name") +def get_binaries_by_name(request: HttpRequest, name: str): + """Get all binaries with the given name.""" + from archivebox.machine.models import Binary + + return list(Binary.objects.filter(name__iexact=name).select_related("machine")) diff --git a/archivebox/base_models/__init__.py b/archivebox/base_models/__init__.py new file mode 100644 index 
0000000000..7c4b68536c --- /dev/null +++ b/archivebox/base_models/__init__.py @@ -0,0 +1 @@ +__package__ = "archivebox.base_models" diff --git a/archivebox/base_models/admin.py b/archivebox/base_models/admin.py new file mode 100644 index 0000000000..d6703b8273 --- /dev/null +++ b/archivebox/base_models/admin.py @@ -0,0 +1,726 @@ +"""Base admin classes for models using UUIDv7.""" + +__package__ = "archivebox.base_models" + +import json +from collections.abc import Mapping +from typing import NotRequired, TypedDict + +from django import forms +from django.contrib import admin +from django.db import models +from django.forms.renderers import BaseRenderer +from django.http import HttpRequest, QueryDict +from django.utils.safestring import SafeString, mark_safe +from django_object_actions import DjangoObjectActions + + +class ConfigOption(TypedDict): + plugin: str + type: str | list[str] + default: object + description: str + enum: NotRequired[list[object]] + pattern: NotRequired[str] + minimum: NotRequired[int | float] + maximum: NotRequired[int | float] + + +class KeyValueWidget(forms.Widget): + """ + A widget that renders JSON dict as editable key-value input fields + with + and - buttons to add/remove rows. + Includes autocomplete for available config keys from the plugin system. 
+ """ + + template_name = "" # We render manually + + class Media: + css = { + "all": [], + } + js = [] + + def _get_config_options(self) -> dict[str, ConfigOption]: + """Get available config options from plugins.""" + try: + from archivebox.hooks import discover_plugin_configs + + plugin_configs = discover_plugin_configs() + options: dict[str, ConfigOption] = {} + for plugin_name, schema in plugin_configs.items(): + for key, prop in schema.get("properties", {}).items(): + option: ConfigOption = { + "plugin": plugin_name, + "type": prop.get("type", "string"), + "default": prop.get("default", ""), + "description": prop.get("description", ""), + } + for schema_key in ("enum", "pattern", "minimum", "maximum"): + if schema_key in prop: + option[schema_key] = prop[schema_key] + options[key] = option + return options + except Exception: + return {} + + def _parse_value(self, value: object) -> dict[str, object]: + # Parse JSON value to dict + if value is None: + return {} + if isinstance(value, str): + try: + parsed = json.loads(value) if value else {} + except json.JSONDecodeError: + return {} + return parsed if isinstance(parsed, dict) else {} + if isinstance(value, Mapping): + return {str(key): item for key, item in value.items()} + return {} + + def render( + self, + name: str, + value: object, + attrs: Mapping[str, str] | None = None, + renderer: BaseRenderer | None = None, + ) -> SafeString: + data = self._parse_value(value) + + widget_id = attrs.get("id", name) if attrs else name + config_options = self._get_config_options() + + # Build datalist options + datalist_options = "\n".join( + f'' + for key, opt in sorted(config_options.items()) + ) + + # Build config metadata as JSON for JS + config_meta_json = json.dumps(config_options) + + html = f''' +
+ + {datalist_options} + +
+ ''' + + # Render existing key-value pairs + for key, val in data.items(): + val_str = json.dumps(val) if not isinstance(val, str) else val + html += self._render_row(widget_id, key, val_str) + + # Always add one empty row for new entries + html += self._render_row(widget_id, "", "") + + html += f''' +
+
+ +
+ + +
+ ''' + return mark_safe(html) + + def _render_row(self, widget_id: str, key: str, value: str) -> str: + return f''' +
+
+ + + + +
+
+
+ ''' + + def _escape(self, s: object) -> str: + """Escape HTML special chars in attribute values.""" + if not s: + return "" + return str(s).replace("&", "&").replace("<", "<").replace(">", ">").replace('"', """) + + def value_from_datadict( + self, + data: QueryDict | Mapping[str, object], + files: object, + name: str, + ) -> str: + value = data.get(name, "{}") + return value if isinstance(value, str) else "{}" + + +class ConfigEditorMixin(admin.ModelAdmin): + """ + Mixin for admin classes with a config JSON field. + + Provides a key-value editor widget with autocomplete for available config keys. + """ + + def formfield_for_dbfield( + self, + db_field: models.Field[object, object], + request: HttpRequest, + **kwargs: object, + ) -> forms.Field | None: + """Use KeyValueWidget for the config JSON field.""" + if db_field.name == "config": + kwargs["widget"] = KeyValueWidget() + return super().formfield_for_dbfield(db_field, request, **kwargs) + + +class BaseModelAdmin(DjangoObjectActions, admin.ModelAdmin): + list_display = ("id", "created_at", "created_by") + readonly_fields = ("id", "created_at", "modified_at") + show_search_mode_selector = False + + def get_default_search_mode(self) -> str: + # The shared changelist template always asks every admin for a default + # search mode, even when the search-mode toggle is hidden. 
+ return "meta" + + def get_form( + self, + request: HttpRequest, + obj: models.Model | None = None, + change: bool = False, + **kwargs: object, + ): + form = super().get_form(request, obj, change=change, **kwargs) + if "created_by" in form.base_fields: + form.base_fields["created_by"].initial = request.user + return form diff --git a/archivebox/base_models/apps.py b/archivebox/base_models/apps.py new file mode 100644 index 0000000000..82bd72f8bf --- /dev/null +++ b/archivebox/base_models/apps.py @@ -0,0 +1,7 @@ +# from django.apps import AppConfig + + +# class BaseModelsConfig(AppConfig): +# default_auto_field = 'django.db.models.BigAutoField' + +# name = 'base_models' diff --git a/archivebox/vendor/__init__.py b/archivebox/base_models/migrations/__init__.py similarity index 100% rename from archivebox/vendor/__init__.py rename to archivebox/base_models/migrations/__init__.py diff --git a/archivebox/base_models/models.py b/archivebox/base_models/models.py new file mode 100755 index 0000000000..e6913a9c74 --- /dev/null +++ b/archivebox/base_models/models.py @@ -0,0 +1,134 @@ +"""Base models using UUIDv7 for all id fields.""" + +__package__ = "archivebox.base_models" + +from archivebox.uuid_compat import uuid7 +from pathlib import Path + +from django.db import models +from django.db.models import F +from django.utils import timezone +from django.contrib.auth import get_user_model +from django.urls import reverse_lazy +from django.conf import settings + +from django_stubs_ext.db.models import TypedModelMeta + + +def get_or_create_system_user_pk(username="system"): + User = get_user_model() + # If there's exactly one superuser, use that for all system operations + if User.objects.filter(is_superuser=True).count() == 1: + return User.objects.filter(is_superuser=True).values_list("pk", flat=True)[0] + # Otherwise get or create the system user + user, _ = User.objects.get_or_create( + username=username, + defaults={"is_staff": True, "is_superuser": True, "email": "", 
"password": "!"}, + ) + return user.pk + + +class AutoDateTimeField(models.DateTimeField): + """DateTimeField that automatically updates on save (legacy compatibility).""" + + def pre_save(self, model_instance, add): + if add or not getattr(model_instance, self.attname): + value = timezone.now() + setattr(model_instance, self.attname, value) + return value + return super().pre_save(model_instance, add) + + +class ModelWithUUID(models.Model): + id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True) + created_at = models.DateTimeField(default=timezone.now, db_index=True) + modified_at = models.DateTimeField(auto_now=True) + created_by = models.ForeignKey( + settings.AUTH_USER_MODEL, + on_delete=models.CASCADE, + default=get_or_create_system_user_pk, + null=False, + db_index=True, + ) + + class Meta(TypedModelMeta): + abstract = True + + def __str__(self) -> str: + return f"[{self.id}] {self.__class__.__name__}" + + @property + def admin_change_url(self) -> str: + return f"/admin/{self._meta.app_label}/{self._meta.model_name}/{self.pk}/change/" + + @property + def api_url(self) -> str: + return str(reverse_lazy("api-1:get_any", args=[self.id])) + + @property + def api_docs_url(self) -> str: + return f"/api/v1/docs#/{self._meta.app_label.title()}%20Models/api_v1_{self._meta.app_label}_get_{self._meta.db_table}" + + +class ModelWithNotes(models.Model): + """Mixin for models with a notes field.""" + + notes = models.TextField(blank=True, null=False, default="") + + class Meta(TypedModelMeta): + abstract = True + + +class ModelWithHealthStats(models.Model): + """Mixin for models with health tracking fields.""" + + num_uses_failed = models.PositiveIntegerField(default=0) + num_uses_succeeded = models.PositiveIntegerField(default=0) + + class Meta(TypedModelMeta): + abstract = True + + @property + def health(self) -> int: + total = max(self.num_uses_failed + self.num_uses_succeeded, 1) + return round((self.num_uses_succeeded / total) * 100) + + 
def increment_health_stats(self, success: bool): + """Atomically increment success or failure counter using F() expression.""" + field = "num_uses_succeeded" if success else "num_uses_failed" + type(self).objects.filter(pk=self.pk).update(**{field: F(field) + 1}) + + +class ModelWithConfig(models.Model): + """Mixin for models with a JSON config field.""" + + config = models.JSONField(default=dict, null=True, blank=True, editable=True) + + class Meta(TypedModelMeta): + abstract = True + + +class ModelWithOutputDir(ModelWithUUID): + class Meta(ModelWithUUID.Meta): + abstract = True + + def save(self, *args, **kwargs): + super().save(*args, **kwargs) + Path(self.output_dir).mkdir(parents=True, exist_ok=True) + # Note: index.json is deprecated, models should use write_index_jsonl() for full data + + @property + def output_dir_parent(self) -> str: + return f"{self._meta.model_name}s" + + @property + def output_dir_name(self) -> str: + return str(self.id) + + @property + def output_dir_str(self) -> str: + return f"{self.output_dir_parent}/{self.output_dir_name}" + + @property + def output_dir(self) -> Path: + raise NotImplementedError(f"{self.__class__.__name__} must implement output_dir property") diff --git a/archivebox/cli/__init__.py b/archivebox/cli/__init__.py index 9622c98ffc..13a62c4f1d 100644 --- a/archivebox/cli/__init__.py +++ b/archivebox/cli/__init__.py @@ -1,155 +1,199 @@ -__package__ = 'archivebox.cli' -__command__ = 'archivebox' - +__package__ = "archivebox.cli" +__command__ = "archivebox" import os import sys -import argparse - -from typing import Optional, Dict, List, IO, Union -from pathlib import Path - -from ..config import OUTPUT_DIR, check_data_folder, check_migrations - from importlib import import_module -CLI_DIR = Path(__file__).resolve().parent - -# these common commands will appear sorted before any others for ease-of-use -meta_cmds = ('help', 'version') # dont require valid data folder at all -main_cmds = ('init', 'config', 'setup') # dont 
require existing db present -archive_cmds = ('add', 'remove', 'update', 'list', 'status') # require existing db present -fake_db = ("oneshot",) # use fake in-memory db - -display_first = (*meta_cmds, *main_cmds, *archive_cmds) - -# every imported command module must have these properties in order to be valid -required_attrs = ('__package__', '__command__', 'main') - -# basic checks to make sure imported files are valid subcommands -is_cli_module = lambda fname: fname.startswith('archivebox_') and fname.endswith('.py') -is_valid_cli_module = lambda module, subcommand: ( - all(hasattr(module, attr) for attr in required_attrs) - and module.__command__.split(' ')[-1] == subcommand -) - - -def list_subcommands() -> Dict[str, str]: - """find and import all valid archivebox_.py files in CLI_DIR""" - - COMMANDS = [] - for filename in os.listdir(CLI_DIR): - if is_cli_module(filename): - subcommand = filename.replace('archivebox_', '').replace('.py', '') - module = import_module('.archivebox_{}'.format(subcommand), __package__) - assert is_valid_cli_module(module, subcommand) - COMMANDS.append((subcommand, module.main.__doc__)) - globals()[subcommand] = module.main - - display_order = lambda cmd: ( - display_first.index(cmd[0]) - if cmd[0] in display_first else - 100 + len(cmd[0]) - ) - - return dict(sorted(COMMANDS, key=display_order)) - - -def run_subcommand(subcommand: str, - subcommand_args: List[str]=None, - stdin: Optional[IO]=None, - pwd: Union[Path, str, None]=None) -> None: - """Run a given ArchiveBox subcommand with the given list of args""" - - subcommand_args = subcommand_args or [] - - if subcommand not in meta_cmds: - from ..config import setup_django - - cmd_requires_db = subcommand in archive_cmds - init_pending = '--init' in subcommand_args or '--quick-init' in subcommand_args - - if cmd_requires_db: - check_data_folder(pwd) - - setup_django(in_memory_db=subcommand in fake_db, check_db=cmd_requires_db and not init_pending) - - if cmd_requires_db: - 
check_migrations() - - module = import_module('.archivebox_{}'.format(subcommand), __package__) - module.main(args=subcommand_args, stdin=stdin, pwd=pwd) # type: ignore - - -SUBCOMMANDS = list_subcommands() - -class NotProvided: - pass - - -def main(args: Optional[List[str]]=NotProvided, stdin: Optional[IO]=NotProvided, pwd: Optional[str]=None) -> None: - args = sys.argv[1:] if args is NotProvided else args - stdin = sys.stdin if stdin is NotProvided else stdin - - subcommands = list_subcommands() - parser = argparse.ArgumentParser( - prog=__command__, - description='ArchiveBox: The self-hosted internet archive', - add_help=False, - ) - group = parser.add_mutually_exclusive_group() - group.add_argument( - '--help', '-h', - action='store_true', - help=subcommands['help'], - ) - group.add_argument( - '--version', - action='store_true', - help=subcommands['version'], - ) - group.add_argument( - "subcommand", - type=str, - help= "The name of the subcommand to run", - nargs='?', - choices=subcommands.keys(), - default=None, - ) - parser.add_argument( - "subcommand_args", - help="Arguments for the subcommand", - nargs=argparse.REMAINDER, - ) - command = parser.parse_args(args or ()) - - if command.version: - command.subcommand = 'version' - elif command.help or command.subcommand is None: - command.subcommand = 'help' - - if command.subcommand not in ('help', 'version', 'status'): - from ..logging_util import log_cli_command - - log_cli_command( - subcommand=command.subcommand, - subcommand_args=command.subcommand_args, - stdin=stdin, - pwd=pwd or OUTPUT_DIR - ) - - run_subcommand( - subcommand=command.subcommand, - subcommand_args=command.subcommand_args, - stdin=stdin, - pwd=pwd or OUTPUT_DIR, - ) - - -__all__ = ( - 'SUBCOMMANDS', - 'list_subcommands', - 'run_subcommand', - *SUBCOMMANDS.keys(), -) - - +import rich_click as click +from rich import print + +from archivebox.config.version import VERSION + + +if "--debug" in sys.argv: + os.environ["DEBUG"] = "True" + 
sys.argv.remove("--debug") + + +class ArchiveBoxGroup(click.Group): + """lazy loading click group for archivebox commands""" + + meta_commands = { + "help": "archivebox.cli.archivebox_help.main", + "version": "archivebox.cli.archivebox_version.main", + "mcp": "archivebox.cli.archivebox_mcp.main", + } + setup_commands = { + "init": "archivebox.cli.archivebox_init.main", + "install": "archivebox.cli.archivebox_install.main", + } + # Model commands (CRUD operations via subcommands) + model_commands = { + "crawl": "archivebox.cli.archivebox_crawl.main", + "snapshot": "archivebox.cli.archivebox_snapshot.main", + "archiveresult": "archivebox.cli.archivebox_archiveresult.main", + "tag": "archivebox.cli.archivebox_tag.main", + "binary": "archivebox.cli.archivebox_binary.main", + "process": "archivebox.cli.archivebox_process.main", + "machine": "archivebox.cli.archivebox_machine.main", + "persona": "archivebox.cli.archivebox_persona.main", + } + archive_commands = { + # High-level commands + "add": "archivebox.cli.archivebox_add.main", + "extract": "archivebox.cli.archivebox_extract.main", + "list": "archivebox.cli.archivebox_list.main", + "remove": "archivebox.cli.archivebox_remove.main", + "run": "archivebox.cli.archivebox_run.main", + "update": "archivebox.cli.archivebox_update.main", + "status": "archivebox.cli.archivebox_status.main", + "search": "archivebox.cli.archivebox_search.main", + "config": "archivebox.cli.archivebox_config.main", + "schedule": "archivebox.cli.archivebox_schedule.main", + "server": "archivebox.cli.archivebox_server.main", + "shell": "archivebox.cli.archivebox_shell.main", + "manage": "archivebox.cli.archivebox_manage.main", + # Introspection commands + "pluginmap": "archivebox.cli.archivebox_pluginmap.main", + } + legacy_model_commands = { + "crawl": "archivebox.cli.archivebox_crawl_compat.main", + "snapshot": "archivebox.cli.archivebox_snapshot_compat.main", + } + all_subcommands = { + **meta_commands, + **setup_commands, + **model_commands, + 
**archive_commands, + } + renamed_commands = { + "setup": "install", + "import": "add", + "archive": "add", + } + legacy_model_subcommands = { + "crawl": {"create", "list", "update", "delete"}, + "snapshot": {"create", "list", "update", "delete"}, + } + + @classmethod + def get_canonical_name(cls, cmd_name): + return cls.renamed_commands.get(cmd_name, cmd_name) + + @classmethod + def _should_use_legacy_model_command(cls, cmd_name: str) -> bool: + if cmd_name not in cls.legacy_model_commands: + return False + + try: + arg_idx = sys.argv.index(cmd_name) + except ValueError: + return False + + remaining_args = sys.argv[arg_idx + 1 :] + if not remaining_args: + return False + + first_arg = remaining_args[0] + if first_arg in ("-h", "--help"): + return False + + return first_arg not in cls.legacy_model_subcommands[cmd_name] + + def get_command(self, ctx, cmd_name): + # handle renamed commands + if cmd_name in self.renamed_commands: + new_name = self.renamed_commands[cmd_name] + print( + f" [violet]Hint:[/violet] `archivebox {cmd_name}` has been renamed to `archivebox {new_name}`", + file=sys.stderr, + ) + cmd_name = new_name + ctx.invoked_subcommand = cmd_name + + if self._should_use_legacy_model_command(cmd_name): + return self._lazy_load(self.legacy_model_commands[cmd_name]) + + # handle lazy loading of commands + if cmd_name in self.all_subcommands: + return self._lazy_load(cmd_name) + + # fall-back to using click's default command lookup + return super().get_command(ctx, cmd_name) + + @classmethod + def _lazy_load(cls, cmd_name_or_path): + import_path = cls.all_subcommands.get(cmd_name_or_path) + if import_path is None: + import_path = cmd_name_or_path + modname, funcname = import_path.rsplit(".", 1) + + # print(f'LAZY LOADING {import_path}') + mod = import_module(modname) + func = getattr(mod, funcname) + + if not hasattr(func, "__doc__"): + raise ValueError(f"lazy loading of {import_path} failed - no docstring found on method") + + # if not isinstance(cmd, 
click.BaseCommand): + # raise ValueError(f'lazy loading of {import_path} failed - not a click command') + + return func + + +@click.group(cls=ArchiveBoxGroup, invoke_without_command=True) +@click.option("--help", "-h", is_flag=True, help="Show help") +@click.version_option(VERSION, "-v", "--version", package_name="archivebox", message="%(version)s") +@click.pass_context +def cli(ctx, help=False): + """ArchiveBox: The self-hosted internet archive""" + + subcommand = ArchiveBoxGroup.get_canonical_name(ctx.invoked_subcommand) + + # if --help is passed or no subcommand is given, show custom help message + if help or ctx.invoked_subcommand is None: + ctx.invoke(ctx.command.get_command(ctx, "help")) + + # if the subcommand is in archive_commands or model_commands, + # then we need to set up the django environment and check that we're in a valid data folder + if subcommand in ArchiveBoxGroup.archive_commands or subcommand in ArchiveBoxGroup.model_commands: + # print('SETUP DJANGO AND CHECK DATA FOLDER') + try: + if subcommand == "server": + run_in_debug = "--reload" in sys.argv or os.environ.get("DEBUG") in ("1", "true", "True", "TRUE", "yes") + if run_in_debug: + os.environ["ARCHIVEBOX_RUNSERVER"] = "1" + if "--reload" in sys.argv: + os.environ["ARCHIVEBOX_AUTORELOAD"] = "1" + from archivebox.config.common import STORAGE_CONFIG + + os.environ["ARCHIVEBOX_RUNSERVER_PIDFILE"] = str(STORAGE_CONFIG.TMP_DIR / "runserver.pid") + + from archivebox.config.django import setup_django + from archivebox.misc.checks import check_data_folder + + setup_django() + check_data_folder() + except Exception as e: + print(f"[red][X] Error setting up Django or checking data folder: {e}[/red]", file=sys.stderr) + if subcommand not in ("manage", "shell"): # not all management commands need django to be setup beforehand + raise + + +def main(args=None, prog_name=None, stdin=None): + # show `docker run archivebox xyz` in help messages if running in docker + IN_DOCKER = os.environ.get("IN_DOCKER", 
False) in ("1", "true", "True", "TRUE", "yes") + IS_TTY = sys.stdin.isatty() + prog_name = prog_name or (f"docker compose run{'' if IS_TTY else ' -T'} archivebox" if IN_DOCKER else "archivebox") + + # stdin param allows passing input data from caller (used by __main__.py) + # currently not used by click-based CLI, but kept for backwards compatibility + + try: + cli(args=args, prog_name=prog_name) + except KeyboardInterrupt: + print("\n\n[red][X] Got CTRL+C. Exiting...[/red]") + + +if __name__ == "__main__": + main() diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index 2c3d7ce384..ae41dae222 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -1,133 +1,286 @@ #!/usr/bin/env python3 -__package__ = 'archivebox.cli' -__command__ = 'archivebox add' +__package__ = "archivebox.cli" +__command__ = "archivebox add" import sys -import argparse +from pathlib import Path -from typing import List, Optional, IO +from typing import TYPE_CHECKING -from ..main import add -from ..util import docstring -from ..parsers import PARSERS -from ..config import OUTPUT_DIR, ONLY_NEW -from ..logging_util import SmartFormatter, accept_stdin, stderr +import rich_click as click +from django.utils import timezone +from django.db.models import QuerySet -@docstring(add.__doc__) -def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None: - parser = argparse.ArgumentParser( - prog=__command__, - description=add.__doc__, - add_help=True, - formatter_class=SmartFormatter, - ) - parser.add_argument( - '--tag', '-t', - type=str, - default='', - help="Tag the added URLs with the provided tags e.g. 
--tag=tag1,tag2,tag3", - ) - parser.add_argument( - '--update-all', #'-n', - action='store_true', - default=not ONLY_NEW, # when ONLY_NEW=True we skip updating old links - help="Also retry previously skipped/failed links when adding new links", - ) - parser.add_argument( - '--index-only', #'-o', - action='store_true', - help="Add the links to the main index without archiving them", - ) - parser.add_argument( - 'urls', - nargs='*', - type=str, - default=None, - help=( - 'URLs or paths to archive e.g.:\n' - ' https://getpocket.com/users/USERNAME/feed/all\n' - ' https://example.com/some/rss/feed.xml\n' - ' https://example.com\n' - ' ~/Downloads/firefox_bookmarks_export.html\n' - ' ~/Desktop/sites_list.csv\n' - ) - ) - parser.add_argument( - "--depth", - action="store", - default=0, - choices=[0, 1], - type=int, - help="Recursively archive all linked pages up to this many hops away" - ) - parser.add_argument( - "--overwrite", - default=False, - action="store_true", - help="Re-archive URLs from scratch, overwriting any existing files" - ) - parser.add_argument( - "--init", #'-i', - action='store_true', - help="Init/upgrade the curent data directory before adding", - ) - parser.add_argument( - "--extract", - type=str, - help="Pass a list of the extractors to be used. If the method name is not correct, it will be ignored. 
\ - This does not take precedence over the configuration", - default="" - ) - parser.add_argument( - "--parser", - type=str, - help="Parser used to read inputted URLs.", - default="auto", - choices=["auto", *PARSERS.keys()], +from archivebox.misc.util import enforce_types, docstring +from archivebox.misc.util import parse_filesize_to_bytes +from archivebox import CONSTANTS +from archivebox.config.common import ARCHIVING_CONFIG, SERVER_CONFIG +from archivebox.config.permissions import USER, HOSTNAME + + +if TYPE_CHECKING: + from archivebox.core.models import Snapshot + from archivebox.crawls.models import Crawl + + +def _collect_input_urls(args: tuple[str, ...]) -> list[str]: + from archivebox.misc.jsonl import read_args_or_stdin + + urls: list[str] = [] + for record in read_args_or_stdin(args): + url = record.get("url") + if isinstance(url, str) and url: + urls.append(url) + + urls_field = record.get("urls") + if isinstance(urls_field, str): + for line in urls_field.splitlines(): + line = line.strip() + if line and not line.startswith("#"): + urls.append(line) + + return urls + + +@enforce_types +def add( + urls: str | list[str], + depth: int | str = 0, + max_urls: int = 0, + max_size: int | str = 0, + tag: str = "", + url_allowlist: str = "", + url_denylist: str = "", + parser: str = "auto", + plugins: str = "", + persona: str = "Default", + overwrite: bool = False, + update: bool | None = None, + index_only: bool = False, + bg: bool = False, + created_by_id: int | None = None, +) -> tuple["Crawl", QuerySet["Snapshot"]]: + """Add a new URL or list of URLs to your archive. + + The flow is: + 1. Save URLs to sources file + 2. Create Crawl with URLs and max_depth + 3. Crawl runner creates Snapshots from Crawl URLs (depth=0) + 4. Crawl runner runs parser extractors on root snapshots + 5. Parser extractors output to urls.jsonl + 6. URLs are added to Crawl.urls and child Snapshots are created + 7. 
Repeat until max_depth is reached + """ + + from rich import print + + depth = int(depth) + max_urls = int(max_urls or 0) + max_size = parse_filesize_to_bytes(max_size) + + if depth not in (0, 1, 2, 3, 4): + raise ValueError("Depth must be 0-4") + if max_urls < 0: + raise ValueError("max_urls must be >= 0") + if max_size < 0: + raise ValueError("max_size must be >= 0") + + # import models once django is set up + from archivebox.core.models import Snapshot + from archivebox.crawls.models import Crawl + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.personas.models import Persona + from archivebox.misc.logging_util import printable_filesize + from archivebox.misc.system import get_dir_size + from archivebox.config.configset import get_config + from archivebox.services.runner import run_crawl + + created_by_id = created_by_id or get_or_create_system_user_pk() + started_at = timezone.now() + if update is None: + update = not ARCHIVING_CONFIG.ONLY_NEW + + # 1. Save the provided URLs to sources/2024-11-05__23-59-59__cli_add.txt + sources_file = CONSTANTS.SOURCES_DIR / f"{timezone.now().strftime('%Y-%m-%d__%H-%M-%S')}__cli_add.txt" + sources_file.parent.mkdir(parents=True, exist_ok=True) + sources_file.write_text(urls if isinstance(urls, str) else "\n".join(urls)) + + # 2. 
Create a new Crawl with inline URLs + cli_args = [*sys.argv] + if cli_args[0].lower().endswith("archivebox"): + cli_args[0] = "archivebox" + cmd_str = " ".join(cli_args) + + timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S") + + # Read URLs directly into crawl + urls_content = sources_file.read_text() + persona_name = (persona or "Default").strip() or "Default" + plugins = plugins or str(get_config().get("PLUGINS") or "") + persona_obj, _ = Persona.objects.get_or_create(name=persona_name) + persona_obj.ensure_dirs() + + crawl = Crawl.objects.create( + urls=urls_content, + max_depth=depth, + max_urls=max_urls, + max_size=max_size, + tags_str=tag, + persona_id=persona_obj.id, + label=f"{USER}@{HOSTNAME} $ {cmd_str} [{timestamp}]", + created_by_id=created_by_id, + config={ + "ONLY_NEW": not update, + "INDEX_ONLY": index_only, + "OVERWRITE": overwrite, + "PLUGINS": plugins, + "DEFAULT_PERSONA": persona_name, + "PARSER": parser, + **({"URL_ALLOWLIST": url_allowlist} if url_allowlist else {}), + **({"URL_DENYLIST": url_denylist} if url_denylist else {}), + }, ) - command = parser.parse_args(args or ()) - urls = command.urls - stdin_urls = '' - if not urls: - stdin_urls = accept_stdin(stdin) + print(f"[green]\\[+] Created Crawl {crawl.id} with max_depth={depth}[/green]") + first_url = crawl.get_urls_list()[0] if crawl.get_urls_list() else "" + print(f" [dim]First URL: {first_url}[/dim]") + + # 3. 
The CrawlMachine will create Snapshots from all URLs when started + # Parser extractors run on snapshots and discover more URLs + # Discovered URLs become child Snapshots (depth+1) + + if index_only: + # Just create the crawl but don't start processing + print("[yellow]\\[*] Index-only mode - crawl created but not started[/yellow]") + # Create snapshots for all URLs in the crawl + for url in crawl.get_urls_list(): + snapshot, _ = Snapshot.objects.update_or_create( + crawl=crawl, + url=url, + defaults={ + "status": Snapshot.INITIAL_STATE, + "retry_at": timezone.now(), + "timestamp": str(timezone.now().timestamp()), + "depth": 0, + }, + ) + if tag: + snapshot.save_tags(tag.split(",")) + snapshot.ensure_crawl_symlink() + return crawl, crawl.snapshot_set.all() + + if bg: + crawl.create_snapshots_from_urls() + + # 5. Start the crawl runner to process the queue + # The runner will: + # - Process Crawl -> create Snapshots from all URLs + # - Process Snapshots -> run extractors + # - Parser extractors discover new URLs -> create child Snapshots + # - Repeat until max_depth reached - if (stdin_urls and urls) or (not stdin and not urls): - stderr( - '[X] You must pass URLs/paths to add via stdin or CLI arguments.\n', - color='red', + if bg: + # Background mode: just queue work and return (background runner via server will pick it up) + print( + "[yellow]\\[*] URLs queued. 
The background runner will process them (run `archivebox server` or `archivebox run --daemon` if not already running).[/yellow]", ) - raise SystemExit(2) - add( - urls=stdin_urls or urls, - depth=command.depth, - tag=command.tag, - update_all=command.update_all, - index_only=command.index_only, - overwrite=command.overwrite, - init=command.init, - extractors=command.extract, - parser=command.parser, - out_dir=pwd or OUTPUT_DIR, - ) + else: + # Foreground mode: run full crawl runner until all work is done + print("[green]\\[*] Starting crawl runner to process crawl...[/green]") + run_crawl(str(crawl.id)) + + # Print summary for foreground runs + try: + crawl.refresh_from_db() + snapshots_count = crawl.snapshot_set.count() + try: + from django.db.models import Count, Sum + + totals = crawl.snapshot_set.aggregate(snapshot_count=Count("id"), total_bytes=Sum("archiveresult__output_size")) + total_bytes = int(totals["total_bytes"] or 0) if totals["snapshot_count"] else 0 + except Exception: + total_bytes, _, _ = get_dir_size(crawl.output_dir) + total_size = printable_filesize(total_bytes) + total_time = timezone.now() - started_at + total_seconds = int(total_time.total_seconds()) + mins, secs = divmod(total_seconds, 60) + hours, mins = divmod(mins, 60) + if hours: + duration_str = f"{hours}h {mins}m {secs}s" + elif mins: + duration_str = f"{mins}m {secs}s" + else: + duration_str = f"{secs}s" + + # Output dir relative to DATA_DIR + try: + rel_output = Path(crawl.output_dir).relative_to(CONSTANTS.DATA_DIR) + rel_output_str = f"./{rel_output}" + except Exception: + rel_output_str = str(crawl.output_dir) + + bind_addr = SERVER_CONFIG.BIND_ADDR or "127.0.0.1:8000" + if bind_addr.startswith("http://") or bind_addr.startswith("https://"): + base_url = bind_addr + else: + base_url = f"http://{bind_addr}" + admin_url = f"{base_url}/admin/crawls/crawl/{crawl.id}/change/" + + print("\n[bold]crawl output saved to:[/bold]") + print(f" {rel_output_str}") + print(f" {admin_url}") + 
print(f"\n[bold]total urls snapshotted:[/bold] {snapshots_count}") + print(f"[bold]total size:[/bold] {total_size}") + print(f"[bold]total time:[/bold] {duration_str}") + except Exception: + # Summary is best-effort; avoid failing the command if something goes wrong + pass + # 6. Return the list of Snapshots in this crawl + snapshots = crawl.snapshot_set.all() + return crawl, snapshots + + +@click.command() +@click.option( + "--depth", + "-d", + type=click.Choice([str(i) for i in range(5)]), + default="0", + help="Recursively archive linked pages up to N hops away", +) +@click.option("--max-urls", type=int, default=0, help="Maximum number of URLs to snapshot for this crawl (0 = unlimited)") +@click.option("--max-size", default="0", help="Maximum total crawl size in bytes or units like 45mb / 1gb (0 = unlimited)") +@click.option("--tag", "-t", default="", help="Comma-separated list of tags to add to each snapshot e.g. tag1,tag2,tag3") +@click.option("--url-allowlist", "--domain-allowlist", default="", help="Comma-separated URL/domain allowlist for this crawl") +@click.option("--url-denylist", "--domain-denylist", default="", help="Comma-separated URL/domain denylist for this crawl") +@click.option("--parser", default="auto", help="Parser for reading input URLs (auto, txt, html, rss, json, jsonl, netscape, ...)") +@click.option("--plugins", "-p", default="", help="Comma-separated list of plugins to run e.g. 
title,favicon,screenshot,singlefile,...") +@click.option("--persona", default="Default", help="Authentication profile to use when archiving") +@click.option("--overwrite", "-F", is_flag=True, help="Overwrite existing data if URLs have been archived previously") +@click.option("--update", is_flag=True, default=None, help="Retry any previously skipped/failed URLs when re-adding them") +@click.option("--index-only", is_flag=True, help="Just add the URLs to the index without archiving them now") +@click.option("--bg", is_flag=True, help="Run archiving in background (queue work and return immediately)") +@click.argument("urls", nargs=-1, type=click.Path()) +@docstring(add.__doc__) +def main(**kwargs): + """Add a new URL or list of URLs to your archive""" + + raw_urls = kwargs.pop("urls") + urls = _collect_input_urls(raw_urls) + if not urls: + raise click.UsageError("No URLs provided. Pass URLs as arguments or via stdin.") + if int(kwargs.get("max_urls") or 0) < 0: + raise click.BadParameter("max_urls must be 0 or a positive integer.", param_hint="--max-urls") + try: + kwargs["max_size"] = parse_filesize_to_bytes(kwargs.get("max_size")) + except ValueError as err: + raise click.BadParameter(str(err), param_hint="--max-size") from err -if __name__ == '__main__': - main(args=sys.argv[1:], stdin=sys.stdin) + add(urls=urls, **kwargs) -# TODO: Implement these -# -# parser.add_argument( -# '--mirror', #'-m', -# action='store_true', -# help='Archive an entire site (finding all linked pages below it on the same domain)', -# ) -# parser.add_argument( -# '--crawler', #'-r', -# choices=('depth_first', 'breadth_first'), -# help='Controls which crawler to use in order to find outlinks in a given page', -# default=None, -# ) +if __name__ == "__main__": + main() diff --git a/archivebox/cli/archivebox_archiveresult.py b/archivebox/cli/archivebox_archiveresult.py new file mode 100644 index 0000000000..9c1eaf7bed --- /dev/null +++ b/archivebox/cli/archivebox_archiveresult.py @@ -0,0 
+1,387 @@ +#!/usr/bin/env python3 + +""" +archivebox archiveresult [args...] [--filters] + +Manage ArchiveResult records (plugin extraction results). + +Actions: + create - Create ArchiveResults for Snapshots (queue extractions) + list - List ArchiveResults as JSONL (with optional filters) + update - Update ArchiveResults from stdin JSONL + delete - Delete ArchiveResults from stdin JSONL + +Examples: + # Create ArchiveResults for snapshots (queue for extraction) + archivebox snapshot list --status=queued | archivebox archiveresult create + archivebox archiveresult create --plugin=screenshot --snapshot-id= + + # List with filters + archivebox archiveresult list --status=failed + archivebox archiveresult list --plugin=screenshot --status=succeeded + + # Update (reset failed extractions to queued) + archivebox archiveresult list --status=failed | archivebox archiveresult update --status=queued + + # Delete + archivebox archiveresult list --plugin=singlefile | archivebox archiveresult delete --yes + + # Re-run failed extractions + archivebox archiveresult list --status=failed | archivebox run +""" + +__package__ = "archivebox.cli" +__command__ = "archivebox archiveresult" + +import sys + +import rich_click as click +from rich import print as rprint + +from archivebox.cli.cli_utils import apply_filters + + +def build_archiveresult_request(snapshot_id: str, plugin: str, hook_name: str = "", status: str = "queued") -> dict: + return { + "type": "ArchiveResult", + "snapshot_id": str(snapshot_id), + "plugin": plugin, + "hook_name": hook_name, + "status": status, + } + + +# ============================================================================= +# CREATE +# ============================================================================= + + +def create_archiveresults( + snapshot_id: str | None = None, + plugin: str | None = None, + status: str = "queued", +) -> int: + """ + Create ArchiveResult request records for Snapshots. 
+ + Reads Snapshot records from stdin and emits ArchiveResult request JSONL. + Pass-through: Non-Snapshot/ArchiveResult records are output unchanged. + If --plugin is specified, only emits requests for that plugin. + Otherwise, emits requests for all enabled snapshot hooks. + + Exit codes: + 0: Success + 1: Failure + """ + from archivebox.config.configset import get_config + from archivebox.hooks import discover_hooks + from archivebox.misc.jsonl import read_stdin, write_record, TYPE_SNAPSHOT, TYPE_ARCHIVERESULT + from archivebox.core.models import Snapshot + + is_tty = sys.stdout.isatty() + + # If snapshot_id provided directly, use that + if snapshot_id: + try: + snapshots = [Snapshot.objects.get(id=snapshot_id)] + pass_through_records = [] + except Snapshot.DoesNotExist: + rprint(f"[red]Snapshot not found: {snapshot_id}[/red]", file=sys.stderr) + return 1 + else: + # Read from stdin + records = list(read_stdin()) + if not records: + rprint("[yellow]No Snapshot records provided via stdin[/yellow]", file=sys.stderr) + return 1 + + # Separate snapshot records from pass-through records + snapshot_ids = [] + pass_through_records = [] + + for record in records: + record_type = record.get("type", "") + + if record_type == TYPE_SNAPSHOT: + # Pass through the Snapshot record itself + pass_through_records.append(record) + if record.get("id"): + snapshot_ids.append(record["id"]) + + elif record_type == TYPE_ARCHIVERESULT: + # ArchiveResult records: pass through if they have an id + if record.get("id"): + pass_through_records.append(record) + # If no id, we could create it, but for now just pass through + else: + pass_through_records.append(record) + + elif record_type: + # Other typed records (Crawl, Tag, etc): pass through + pass_through_records.append(record) + + elif record.get("id"): + # Untyped record with id - assume it's a snapshot ID + snapshot_ids.append(record["id"]) + + # Output pass-through records first + if not is_tty: + for record in pass_through_records: + 
def list_archiveresults(
    status: str | None = None,
    plugin: str | None = None,
    snapshot_id: str | None = None,
    limit: int | None = None,
) -> int:
    """
    List ArchiveResults as JSONL with optional filters.

    When stdout is a TTY, one colorized summary row is printed per result;
    otherwise each result's JSON record is written to stdout. A summary count
    is always printed to stderr.

    Args:
        status: Only list results with this status.
        plugin: Only list results produced by this plugin.
        snapshot_id: Only list results belonging to this snapshot.
        limit: Maximum number of results, forwarded to apply_filters().

    Exit codes:
        0: Success (even if no results)
    """
    from archivebox.misc.jsonl import write_record
    from archivebox.core.models import ArchiveResult

    is_tty = sys.stdout.isatty()

    # Status -> rich color lookup, built once rather than once per row.
    status_colors = {
        "queued": "yellow",
        "started": "blue",
        "succeeded": "green",
        "failed": "red",
        "skipped": "dim",
        "noresults": "dim",
        "backoff": "magenta",
    }

    queryset = ArchiveResult.objects.all().order_by("-start_ts")
    if is_tty:
        # The TTY row prints result.snapshot.url; fetch the snapshot in the same
        # query so it is not loaded separately for every row.
        queryset = queryset.select_related("snapshot")

    # Apply filters
    filter_kwargs = {
        "status": status,
        "plugin": plugin,
        "snapshot_id": snapshot_id,
    }
    queryset = apply_filters(queryset, filter_kwargs, limit=limit)

    count = 0
    for result in queryset:
        if is_tty:
            status_color = status_colors.get(result.status, "dim")
            rprint(
                f"[{status_color}]{result.status:10}[/{status_color}] {result.plugin:15} [dim]{result.id}[/dim] {result.snapshot.url[:40]}",
            )
        else:
            write_record(result.to_json())
        count += 1

    rprint(f"[dim]Listed {count} archive results[/dim]", file=sys.stderr)
    return 0
def delete_archiveresults(yes: bool = False, dry_run: bool = False) -> int:
    """
    Delete ArchiveResults from stdin JSONL.

    Requires --yes flag to confirm deletion.

    Args:
        yes: Actually perform the deletion.
        dry_run: Only print what would be deleted (takes precedence over yes).

    Exit codes:
        0: Success
        1: No input or missing --yes flag
    """
    from archivebox.misc.jsonl import read_stdin
    from archivebox.core.models import ArchiveResult

    records = list(read_stdin())
    if not records:
        rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
        return 1

    result_ids = [r.get("id") for r in records if r.get("id")]

    if not result_ids:
        rprint("[yellow]No valid archive result IDs in input[/yellow]", file=sys.stderr)
        return 1

    results = ArchiveResult.objects.filter(id__in=result_ids)
    count = results.count()

    if count == 0:
        rprint("[yellow]No matching archive results found[/yellow]", file=sys.stderr)
        return 0

    if dry_run:
        rprint(f"[yellow]Would delete {count} archive results (dry run)[/yellow]", file=sys.stderr)
        # Preview at most 10 rows so a large selection does not flood stderr.
        for result in results[:10]:
            rprint(f" [dim]{result.id}[/dim] {result.plugin} {result.snapshot.url[:40]}", file=sys.stderr)
        if count > 10:
            rprint(f" ... and {count - 10} more", file=sys.stderr)
        return 0

    if not yes:
        rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr)
        return 1

    # Perform deletion.
    # QuerySet.delete() returns (total_rows, {model_label: rows}); the total also
    # counts rows removed from other tables by cascade, so report only the
    # ArchiveResult rows to keep the message accurate.
    _total_deleted, deleted_by_model = results.delete()
    deleted_count = deleted_by_model.get(ArchiveResult._meta.label, 0)
    rprint(f"[green]Deleted {deleted_count} archive results[/green]", file=sys.stderr)
    return 0
sys.exit(create_archiveresults(snapshot_id=snapshot_id, plugin=plugin, status=status)) + + +@main.command("list") +@click.option("--status", "-s", help="Filter by status (queued, started, succeeded, failed, skipped)") +@click.option("--plugin", "-p", help="Filter by plugin name") +@click.option("--snapshot-id", help="Filter by snapshot ID") +@click.option("--limit", "-n", type=int, help="Limit number of results") +def list_cmd( + status: str | None, + plugin: str | None, + snapshot_id: str | None, + limit: int | None, +): + """List ArchiveResults as JSONL.""" + sys.exit( + list_archiveresults( + status=status, + plugin=plugin, + snapshot_id=snapshot_id, + limit=limit, + ), + ) + + +@main.command("update") +@click.option("--status", "-s", help="Set status") +def update_cmd(status: str | None): + """Update ArchiveResults from stdin JSONL.""" + sys.exit(update_archiveresults(status=status)) + + +@main.command("delete") +@click.option("--yes", "-y", is_flag=True, help="Confirm deletion") +@click.option("--dry-run", is_flag=True, help="Show what would be deleted") +def delete_cmd(yes: bool, dry_run: bool): + """Delete ArchiveResults from stdin JSONL.""" + sys.exit(delete_archiveresults(yes=yes, dry_run=dry_run)) + + +if __name__ == "__main__": + main() diff --git a/archivebox/cli/archivebox_binary.py b/archivebox/cli/archivebox_binary.py new file mode 100644 index 0000000000..d156d8cca5 --- /dev/null +++ b/archivebox/cli/archivebox_binary.py @@ -0,0 +1,319 @@ +#!/usr/bin/env python3 + +""" +archivebox binary [args...] [--filters] + +Manage Binary records (detected executables like chrome, wget, etc.). 
+ +Actions: + create - Create/register a Binary + list - List Binaries as JSONL (with optional filters) + update - Update Binaries from stdin JSONL + delete - Delete Binaries from stdin JSONL + +Examples: + # List all binaries + archivebox binary list + + # List specific binary + archivebox binary list --name=chrome + + # List binaries with specific version + archivebox binary list --version__icontains=120 + + # Delete old binary entries + archivebox binary list --name=chrome | archivebox binary delete --yes +""" + +__package__ = "archivebox.cli" +__command__ = "archivebox binary" + +import sys + +import rich_click as click +from rich import print as rprint + +from archivebox.cli.cli_utils import apply_filters + + +# ============================================================================= +# CREATE +# ============================================================================= + + +def create_binary( + name: str, + abspath: str, + version: str = "", +) -> int: + """ + Create/register a Binary. + + Exit codes: + 0: Success + 1: Failure + """ + from archivebox.misc.jsonl import write_record + from archivebox.machine.models import Binary + + is_tty = sys.stdout.isatty() + + if not name or not abspath: + rprint("[red]Both --name and --abspath are required[/red]", file=sys.stderr) + return 1 + + try: + from archivebox.machine.models import Machine + + machine = Machine.current() + created = not Binary.objects.filter( + machine=machine, + name=name, + abspath=abspath, + version=version, + ).exists() + + # Mirror the Binary model lifecycle used elsewhere in the system so CLI + # records are owned by the current machine and can be safely piped into + # `archivebox run` without creating invalid rows missing machine_id. 
def list_binaries(
    name: str | None = None,
    abspath__icontains: str | None = None,
    version__icontains: str | None = None,
    limit: int | None = None,
) -> int:
    """
    Print the Binary rows matching the given filters.

    Human-readable rows are printed when stdout is a terminal; JSONL records
    are written otherwise. The number of rows listed is reported on stderr.

    Exit codes:
        0: Success (even if no results)
    """
    from archivebox.misc.jsonl import write_record
    from archivebox.machine.models import Binary

    interactive = sys.stdout.isatty()

    selected = apply_filters(
        Binary.objects.all().order_by("name", "-modified_at", "-created_at"),
        {
            "name": name,
            "abspath__icontains": abspath__icontains,
            "version__icontains": version__icontains,
        },
        limit=limit,
    )

    listed = 0
    for listed, entry in enumerate(selected, start=1):
        if not interactive:
            write_record(entry.to_json())
            continue
        rprint(f"[cyan]{entry.name:20}[/cyan] [dim]{entry.version:15}[/dim] {entry.abspath}")

    rprint(f"[dim]Listed {listed} binaries[/dim]", file=sys.stderr)
    return 0
def delete_binaries(yes: bool = False, dry_run: bool = False) -> int:
    """
    Delete Binaries from stdin JSONL.

    Requires --yes flag to confirm deletion.

    Args:
        yes: Actually perform the deletion.
        dry_run: Only print what would be deleted (takes precedence over yes).

    Exit codes:
        0: Success
        1: No input or missing --yes flag
    """
    from archivebox.misc.jsonl import read_stdin
    from archivebox.machine.models import Binary

    records = list(read_stdin())
    if not records:
        rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
        return 1

    binary_ids = [r.get("id") for r in records if r.get("id")]

    if not binary_ids:
        rprint("[yellow]No valid binary IDs in input[/yellow]", file=sys.stderr)
        return 1

    binaries = Binary.objects.filter(id__in=binary_ids)
    count = binaries.count()

    if count == 0:
        rprint("[yellow]No matching binaries found[/yellow]", file=sys.stderr)
        return 0

    if dry_run:
        rprint(f"[yellow]Would delete {count} binaries (dry run)[/yellow]", file=sys.stderr)
        for binary in binaries:
            rprint(f" {binary.name} {binary.abspath}", file=sys.stderr)
        return 0

    if not yes:
        rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr)
        return 1

    # Perform deletion.
    # QuerySet.delete() returns (total_rows, {model_label: rows}); the total also
    # counts rows removed from other tables by cascade, so report only the
    # Binary rows to keep the message accurate.
    _total_deleted, deleted_by_model = binaries.delete()
    deleted_count = deleted_by_model.get(Binary._meta.label, 0)
    rprint(f"[green]Deleted {deleted_count} binaries[/green]", file=sys.stderr)
    return 0
@enforce_types
def config(
    *keys,
    get: bool = False,
    set: bool = False,
    search: bool = False,
    reset: bool = False,
    **kwargs,
) -> None:
    """Get and set your ArchiveBox project configuration values"""
    # NOTE: the docstring above is reused as the CLI help text (main() is
    # decorated with @docstring(config.__doc__)), so implementation notes are
    # kept in comments instead of the docstring.
    #
    # Modes (first match wins): --search, --get / no arguments, --set, --reset.
    # Every mode ends by raising SystemExit, except a --set call in which no
    # option failed, which returns None.
    from archivebox.misc.checks import check_data_folder
    from archivebox.misc.logging_util import printable_config
    from archivebox.config.collection import load_all_config, write_config_file, get_real_name
    from archivebox.config.configset import get_flat_config, get_all_configs

    check_data_folder()

    FLAT_CONFIG = get_flat_config()
    CONFIGS = get_all_configs()

    # Options may arrive as the click "KEY=VALUE" argument, as positional keys,
    # or as plain keyword arguments (which are turned into KEY=VALUE strings).
    config_options: list[str] = list(kwargs.pop("key=value", []) or keys or [f"{key}={val}" for key, val in kwargs.items()])
    no_args = not (get or set or reset or config_options)

    matching_config = {}
    if search:
        if config_options:
            config_options = [get_real_name(key) for key in config_options]
            matching_config = {key: FLAT_CONFIG[key] for key in config_options if key in FLAT_CONFIG}
            for config_section in CONFIGS.values():
                aliases = getattr(config_section, "aliases", {})

                for search_key in config_options:
                    # search all aliases in the section
                    for alias_key, key in aliases.items():
                        if search_key.lower() in alias_key.lower():
                            matching_config[key] = dict(config_section)[key]

                    # search all keys and values in the section
                    for existing_key, value in dict(config_section).items():
                        if search_key.lower() in existing_key.lower() or search_key.lower() in str(value).lower():
                            matching_config[existing_key] = value

        print(printable_config(matching_config))
        # Exit status 1 when nothing matched, 0 otherwise.
        raise SystemExit(not matching_config)

    elif get or no_args:
        if config_options:
            config_options = [get_real_name(key) for key in config_options]
            matching_config = {key: FLAT_CONFIG[key] for key in config_options if key in FLAT_CONFIG}
            failed_config = [key for key in config_options if key not in FLAT_CONFIG]
            if failed_config:
                print("\n[red][X] These options failed to get[/red]")
                # Only list the keys that were actually missing, not every
                # requested key.
                print(" {}".format("\n ".join(failed_config)))
                raise SystemExit(1)
        else:
            matching_config = FLAT_CONFIG

        # Display core config sections
        for config_section in CONFIGS.values():
            section_header = getattr(config_section, "toml_section_header", "")
            if isinstance(section_header, str) and section_header:
                print(f"[grey53]\\[{section_header}][/grey53]")
            else:
                print("[grey53]\\[CONSTANTS] # (read-only)[/grey53]")

            kv_in_section = {key: val for key, val in dict(config_section).items() if key in matching_config}
            print(benedict(kv_in_section).to_toml(encoder=CustomTOMLEncoder()).strip().replace("\n\n", "\n"))
            print("[grey53]################################################################[/grey53]")

        # Display plugin config section
        from archivebox.hooks import discover_plugin_configs

        plugin_configs = discover_plugin_configs()
        plugin_keys = {}

        # Collect all plugin config keys
        for plugin_name, schema in plugin_configs.items():
            if "properties" not in schema:
                continue
            for key in schema["properties"].keys():
                if key in matching_config:
                    plugin_keys[key] = matching_config[key]

        # Display all plugin config in single [PLUGINS] section
        if plugin_keys:
            print("[grey53]\\[PLUGINS][/grey53]")
            print(benedict(plugin_keys).to_toml(encoder=CustomTOMLEncoder()).strip().replace("\n\n", "\n"))
            print("[grey53]################################################################[/grey53]")

        # Exit status 1 when nothing matched, 0 otherwise.
        raise SystemExit(not matching_config)

    elif set:
        new_config = {}
        failed_options = []
        for line in config_options:
            if line.startswith("#") or not line.strip():
                continue
            if "=" not in line:
                print("[red][X] Config KEY=VALUE must have an = sign in it[/red]")
                print(f" {line}")
                raise SystemExit(2)

            raw_key, val = line.split("=", 1)
            raw_key = raw_key.upper().strip()
            key = get_real_name(raw_key)
            if key != raw_key:
                print(
                    f"[yellow][i] Note: The config option {raw_key} has been renamed to {key}, please use the new name going forwards.[/yellow]",
                )

            # Only keys that already exist in the flat config can be set.
            if key in FLAT_CONFIG:
                new_config[key] = val.strip()
            else:
                failed_options.append(line)

        if new_config:
            before = FLAT_CONFIG
            matching_config = write_config_file(new_config)
            after = {**load_all_config(), **get_flat_config()}
            print(printable_config(matching_config))

            # Report options whose value changed even though they were not set
            # directly by this call.
            side_effect_changes = {}
            for key, val in after.items():
                if key in FLAT_CONFIG and (str(before[key]) != str(after[key])) and (key not in matching_config):
                    side_effect_changes[key] = after[key]

            if side_effect_changes:
                print(file=sys.stderr)
                print("[yellow][i] Note: This change also affected these other options that depended on it:[/yellow]", file=sys.stderr)
                print(" {}".format(printable_config(side_effect_changes, prefix=" ")), file=sys.stderr)

        if failed_options:
            print()
            print("[red][X] These options failed to set (check for typos):[/red]")
            print(" {}".format("\n ".join(failed_options)))
            raise SystemExit(1)

    elif reset:
        print("[red][X] This command is not implemented yet.[/red]")
        print(" Please manually remove the relevant lines from your config file:")
        raise SystemExit(2)

    else:
        print("[red][X] You must pass either --get or --set, or no arguments to get the whole config.[/red]")
        print(" archivebox config")
        print(" archivebox config --get SOME_KEY")
        print(" archivebox config --set SOME_KEY=SOME_VALUE")
        raise SystemExit(2)
parser.add_mutually_exclusive_group() - group.add_argument( - '--get', #'-g', - action='store_true', - help="Get the value for the given config KEYs", - ) - group.add_argument( - '--set', #'-s', - action='store_true', - help="Set the given KEY=VALUE config values", - ) - group.add_argument( - '--reset', #'-s', - action='store_true', - help="Reset the given KEY config values to their defaults", - ) - parser.add_argument( - 'config_options', - nargs='*', - type=str, - help='KEY or KEY=VALUE formatted config values to get or set', - ) - command = parser.parse_args(args or ()) - - config_options_str = '' - if not command.config_options: - config_options_str = accept_stdin(stdin) - - config( - config_options_str=config_options_str, - config_options=command.config_options, - get=command.get, - set=command.set, - reset=command.reset, - out_dir=pwd or OUTPUT_DIR, - ) - - -if __name__ == '__main__': - main(args=sys.argv[1:], stdin=sys.stdin) +def main(**kwargs) -> None: + config(**kwargs) + + +if __name__ == "__main__": + main() diff --git a/archivebox/cli/archivebox_crawl.py b/archivebox/cli/archivebox_crawl.py new file mode 100644 index 0000000000..c2b3c90136 --- /dev/null +++ b/archivebox/cli/archivebox_crawl.py @@ -0,0 +1,384 @@ +#!/usr/bin/env python3 + +""" +archivebox crawl [args...] [--filters] + +Manage Crawl records. 
def create_crawl(
    urls: Iterable[str],
    depth: int = 0,
    tag: str = "",
    status: str = "queued",
    created_by_id: int | None = None,
) -> int:
    """
    Build a single Crawl out of every URL found in the arguments or on stdin.

    Input records carrying a "url" or a newline-separated "urls" field
    contribute URLs. Typed records without URLs, and Crawl records that already
    have an id, are forwarded to stdout untouched so the command can sit in the
    middle of a pipeline. JSONL is only written when stdout is not a TTY.

    Exit codes:
        0: Success
        1: Failure
    """
    from archivebox.misc.jsonl import read_args_or_stdin, write_record, TYPE_CRAWL
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.crawls.models import Crawl

    created_by_id = created_by_id or get_or_create_system_user_pk()
    is_tty = sys.stdout.isatty()

    incoming = list(read_args_or_stdin(urls))
    if not incoming:
        rprint("[yellow]No URLs provided. Pass URLs as arguments or via stdin.[/yellow]", file=sys.stderr)
        return 1

    collected_urls = []
    forwarded = []

    for entry in incoming:
        kind = entry.get("type", "")

        # Typed, non-Crawl records that carry no URLs are not ours to handle.
        unrelated = bool(kind) and kind != TYPE_CRAWL and not entry.get("url") and not entry.get("urls")
        # Crawl records with an id already exist; forward them as-is.
        already_created = kind == TYPE_CRAWL and bool(entry.get("id"))
        if unrelated or already_created:
            forwarded.append(entry)
            continue

        single_url = entry.get("url")
        if single_url:
            collected_urls.append(single_url)

        # "urls" holds one URL per line; blank lines and "#" comments are dropped.
        multi_urls = entry.get("urls")
        if multi_urls:
            for raw_line in multi_urls.split("\n"):
                cleaned = raw_line.strip()
                if cleaned and not cleaned.startswith("#"):
                    collected_urls.append(cleaned)

    # Forwarded records go out before anything newly created.
    if not is_tty:
        for entry in forwarded:
            write_record(entry)

    if not collected_urls and forwarded:
        rprint(f"[dim]Passed through {len(forwarded)} records, no new URLs[/dim]", file=sys.stderr)
        return 0
    if not collected_urls:
        rprint("[red]No valid URLs found[/red]", file=sys.stderr)
        return 1

    try:
        crawl = Crawl.from_json(
            {
                "urls": "\n".join(collected_urls),
                "max_depth": depth,
                "tags_str": tag,
                "status": status,
                "label": "",
            },
            overrides={"created_by_id": created_by_id},
        )
        if not crawl:
            rprint("[red]Failed to create crawl[/red]", file=sys.stderr)
            return 1

        if not is_tty:
            write_record(crawl.to_json())

        total = len(collected_urls)
        rprint(f"[green]Created crawl with {total} URLs[/green]", file=sys.stderr)

        if is_tty:
            # Human-readable summary: crawl id plus the first five URLs.
            rprint(f" [dim]{crawl.id}[/dim]", file=sys.stderr)
            for shown in collected_urls[:5]:
                rprint(f" {shown[:70]}", file=sys.stderr)
            if total > 5:
                rprint(f" ... and {total - 5} more", file=sys.stderr)

        return 0

    except Exception as e:
        rprint(f"[red]Error creating crawl: {e}[/red]", file=sys.stderr)
        return 1
+ + Exit codes: + 0: Success (even if no results) + """ + from archivebox.misc.jsonl import write_record + from archivebox.crawls.models import Crawl + + is_tty = sys.stdout.isatty() + + queryset = Crawl.objects.all().order_by("-created_at") + + # Apply filters + filter_kwargs = { + "status": status, + "urls__icontains": urls__icontains, + "max_depth": max_depth, + } + queryset = apply_filters(queryset, filter_kwargs, limit=limit) + + count = 0 + for crawl in queryset: + if is_tty: + status_color = { + "queued": "yellow", + "started": "blue", + "sealed": "green", + }.get(crawl.status, "dim") + url_preview = crawl.urls[:50].replace("\n", " ") + rprint(f"[{status_color}]{crawl.status:8}[/{status_color}] [dim]{crawl.id}[/dim] {url_preview}...") + else: + write_record(crawl.to_json()) + count += 1 + + rprint(f"[dim]Listed {count} crawls[/dim]", file=sys.stderr) + return 0 + + +# ============================================================================= +# UPDATE +# ============================================================================= + + +def update_crawls( + status: str | None = None, + max_depth: int | None = None, +) -> int: + """ + Update Crawls from stdin JSONL. + + Reads Crawl records from stdin and applies updates. + Uses PATCH semantics - only specified fields are updated. 
def delete_crawls(yes: bool = False, dry_run: bool = False) -> int:
    """
    Delete Crawls from stdin JSONL.

    Requires --yes flag to confirm deletion.

    Args:
        yes: Actually perform the deletion.
        dry_run: Only print what would be deleted (takes precedence over yes).

    Exit codes:
        0: Success
        1: No input or missing --yes flag
    """
    from archivebox.misc.jsonl import read_stdin
    from archivebox.crawls.models import Crawl

    records = list(read_stdin())
    if not records:
        rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
        return 1

    crawl_ids = [r.get("id") for r in records if r.get("id")]

    if not crawl_ids:
        rprint("[yellow]No valid crawl IDs in input[/yellow]", file=sys.stderr)
        return 1

    crawls = Crawl.objects.filter(id__in=crawl_ids)
    count = crawls.count()

    if count == 0:
        rprint("[yellow]No matching crawls found[/yellow]", file=sys.stderr)
        return 0

    if dry_run:
        rprint(f"[yellow]Would delete {count} crawls (dry run)[/yellow]", file=sys.stderr)
        for crawl in crawls:
            url_preview = crawl.urls[:50].replace("\n", " ")
            rprint(f" [dim]{crawl.id}[/dim] {url_preview}...", file=sys.stderr)
        return 0

    if not yes:
        rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr)
        return 1

    # Perform deletion.
    # QuerySet.delete() returns (total_rows, {model_label: rows}); the total also
    # counts rows removed from other tables by cascade, which would inflate the
    # reported number of crawls. Report only the Crawl rows.
    _total_deleted, deleted_by_model = crawls.delete()
    deleted_count = deleted_by_model.get(Crawl._meta.label, 0)
    rprint(f"[green]Deleted {deleted_count} crawls[/green]", file=sys.stderr)
    return 0
@main.command("update")
@click.option("--status", "-s", help="Set status")
@click.option("--max-depth", type=int, help="Set max depth")
def update_cmd(status: str | None, max_depth: int | None):
    """Update Crawls from stdin JSONL."""
    # Delegate to update_crawls() and use its return value as the exit code.
    exit_code = update_crawls(status=status, max_depth=max_depth)
    sys.exit(exit_code)
def process_archiveresult_by_id(archiveresult_id: str) -> int:
    """
    Re-run extraction for a single ArchiveResult by ID.

    ArchiveResults are projected status rows, not queued work items. Re-running
    a single result means resetting that row and queueing its parent snapshot
    through the shared crawl runner with the corresponding plugin selected.

    Args:
        archiveresult_id: ID of the ArchiveResult row to re-run.

    Returns:
        0 when the result ends up succeeded, with no results, or in any other
        non-failed status; 1 when the row does not exist, the extraction
        failed, or an exception was raised while re-running it.
    """
    from rich import print as rprint
    from rich.markup import escape
    from django.utils import timezone
    from archivebox.core.models import ArchiveResult
    from archivebox.services.runner import run_crawl

    try:
        archiveresult = ArchiveResult.objects.get(id=archiveresult_id)
    except ArchiveResult.DoesNotExist:
        rprint(f"[red]ArchiveResult {archiveresult_id} not found[/red]", file=sys.stderr)
        return 1

    rprint(f"[blue]Extracting {archiveresult.plugin} for {archiveresult.snapshot.url}[/blue]", file=sys.stderr)

    try:
        archiveresult.reset_for_retry()

        # Re-queue the parent snapshot so the runner picks it up again.
        snapshot = archiveresult.snapshot
        snapshot.status = snapshot.StatusChoices.QUEUED
        snapshot.retry_at = timezone.now()
        snapshot.save(update_fields=["status", "retry_at", "modified_at"])

        # Leave a crawl that is already STARTED alone; otherwise re-queue it too.
        crawl = snapshot.crawl
        if crawl.status != crawl.StatusChoices.STARTED:
            crawl.status = crawl.StatusChoices.QUEUED
            crawl.retry_at = timezone.now()
            crawl.save(update_fields=["status", "retry_at", "modified_at"])

        run_crawl(str(crawl.id), snapshot_ids=[str(snapshot.id)], selected_plugins=[archiveresult.plugin])
        archiveresult.refresh_from_db()

        # The messages below contain Rich markup, so they must go through
        # rprint (builtin print would emit the tags literally). Dynamic text is
        # escaped so stray "[...]" in plugin output is not parsed as markup.
        output = escape(str(archiveresult.output_str))
        if archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED:
            rprint(f"[green]Extraction succeeded: {output}[/green]")
            return 0
        elif archiveresult.status == ArchiveResult.StatusChoices.NORESULTS:
            rprint(f"[dim]Extraction completed with no results: {output}[/dim]")
            return 0
        elif archiveresult.status == ArchiveResult.StatusChoices.FAILED:
            rprint(f"[red]Extraction failed: {output}[/red]", file=sys.stderr)
            return 1
        else:
            # Still in progress or backoff - not a failure
            rprint(f"[yellow]Extraction status: {archiveresult.status}[/yellow]")
            return 0

    except Exception as e:
        # CLI boundary: report any failure as a non-zero exit code instead of a traceback.
        rprint(f"[red]Extraction error: {type(e).__name__}: {escape(str(e))}[/red]", file=sys.stderr)
        return 1
""" + Run plugins on Snapshots from input. + + Reads Snapshot IDs or JSONL from args/stdin, runs plugins, outputs JSONL. + + Exit codes: + 0: Success + 1: Failure + """ + from rich import print as rprint + from django.utils import timezone + + from archivebox.misc.jsonl import ( + read_args_or_stdin, + write_record, + TYPE_SNAPSHOT, + TYPE_ARCHIVERESULT, + ) + from archivebox.core.models import Snapshot + from archivebox.services.runner import run_crawl + + is_tty = sys.stdout.isatty() + + # Parse comma-separated plugins list once (reused in creation and filtering) + plugins_list = [p.strip() for p in plugins.split(",") if p.strip()] if plugins else [] + + # Parse stdin/args exactly once per CLI invocation. + # `main()` may already have consumed stdin to distinguish Snapshot input from + # ArchiveResult IDs; if so, it must pass the parsed records through here + # instead of asking this helper to reread an already-drained pipe. + if records is None: + records = list(read_args_or_stdin(args)) + + if not records: + rprint("[yellow]No snapshots provided. 
Pass snapshot IDs as arguments or via stdin.[/yellow]", file=sys.stderr) + return 1 + + # Gather snapshot IDs and optional plugin constraints to process + snapshot_ids = set() + requested_plugins_by_snapshot: dict[str, set[str]] = defaultdict(set) + for record in records: + record_type = record.get("type") + + if record_type == TYPE_SNAPSHOT: + snapshot_id = record.get("id") + if snapshot_id: + snapshot_ids.add(snapshot_id) + elif record.get("url"): + # Look up by URL (get most recent if multiple exist) + snap = Snapshot.objects.filter(url=record["url"]).order_by("-created_at").first() + if snap: + snapshot_ids.add(str(snap.id)) + else: + rprint(f"[yellow]Snapshot not found for URL: {record['url']}[/yellow]", file=sys.stderr) + + elif record_type == TYPE_ARCHIVERESULT: + snapshot_id = record.get("snapshot_id") + if snapshot_id: + snapshot_ids.add(snapshot_id) + plugin_name = record.get("plugin") + if plugin_name and not plugins_list: + requested_plugins_by_snapshot[str(snapshot_id)].add(str(plugin_name)) + + elif "id" in record: + # Assume it's a snapshot ID + snapshot_ids.add(record["id"]) + + if not snapshot_ids: + rprint("[red]No valid snapshot IDs found in input[/red]", file=sys.stderr) + return 1 + + # Get snapshots and ensure they have pending ArchiveResults + processed_count = 0 + for snapshot_id in snapshot_ids: + try: + snapshot = Snapshot.objects.get(id=snapshot_id) + except Snapshot.DoesNotExist: + rprint(f"[yellow]Snapshot {snapshot_id} not found[/yellow]", file=sys.stderr) + continue + + requested_plugin_names = set(plugins_list) | requested_plugins_by_snapshot.get(str(snapshot.id), set()) + for plugin_name in requested_plugin_names: + existing_result = snapshot.archiveresult_set.filter(plugin=plugin_name).order_by("-created_at").first() + if existing_result: + existing_result.reset_for_retry() + + # Reset snapshot status to allow processing + if snapshot.status == Snapshot.StatusChoices.SEALED: + snapshot.status = Snapshot.StatusChoices.STARTED + 
def is_archiveresult_id(value: str) -> bool:
    """
    Check whether ``value`` is the UUID of an existing ArchiveResult.

    The value must first be a canonical dashed UUID string (8-4-4-4-12 hex
    digits, case-insensitive); only then is the database consulted.

    Args:
        value: Candidate ID taken from CLI args or stdin records.

    Returns:
        True only if ``value`` is UUID-shaped and an ArchiveResult with that
        ID exists; False for anything else, including non-string input.
    """
    import re

    # Records may carry non-string values; those can never be a UUID string.
    if not isinstance(value, str):
        return False

    # fullmatch instead of "^...$": "$" also matches just before a trailing
    # newline, which would let "uuid\n" through to the database lookup.
    uuid_shape = r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}"
    if re.fullmatch(uuid_shape, value, re.IGNORECASE) is None:
        return False

    # Verify it's actually an ArchiveResult (not a Snapshot or other object)
    from archivebox.core.models import ArchiveResult

    return ArchiveResult.objects.filter(id=value).exists()
@@ -1,32 +1,131 @@ #!/usr/bin/env python3 +__package__ = "archivebox.cli" +__command__ = "archivebox help" -__package__ = 'archivebox.cli' -__command__ = 'archivebox help' +import os +from pathlib import Path -import sys -import argparse +import click +from rich import print +from rich.panel import Panel -from typing import Optional, List, IO -from ..main import help -from ..util import docstring -from ..config import OUTPUT_DIR -from ..logging_util import SmartFormatter, reject_stdin +def help() -> None: + """Print the ArchiveBox help message and usage""" + from archivebox.cli import ArchiveBoxGroup + from archivebox.config import CONSTANTS + from archivebox.config.permissions import IN_DOCKER + from archivebox.misc.logging_util import log_cli_command -@docstring(help.__doc__) -def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None: - parser = argparse.ArgumentParser( - prog=__command__, - description=help.__doc__, - add_help=True, - formatter_class=SmartFormatter, + log_cli_command("help", [], None, ".") + + COMMANDS_HELP_TEXT = ( + "\n ".join( + f"[green]{cmd.ljust(20)}[/green] {ArchiveBoxGroup._lazy_load(cmd).__doc__}" for cmd in ArchiveBoxGroup.meta_commands.keys() + ) + + "\n\n " + + "\n ".join( + f"[green]{cmd.ljust(20)}[/green] {ArchiveBoxGroup._lazy_load(cmd).__doc__}" for cmd in ArchiveBoxGroup.setup_commands.keys() + ) + + "\n\n " + + "\n ".join( + f"[green]{cmd.ljust(20)}[/green] {ArchiveBoxGroup._lazy_load(cmd).__doc__}" for cmd in ArchiveBoxGroup.archive_commands.keys() + ) + ) + + DOCKER_USAGE = ( + """ +[dodger_blue3]Docker Usage:[/dodger_blue3] + [grey53]# using Docker Compose:[/grey53] + [blue]docker compose run[/blue] [dark_green]archivebox[/dark_green] [green]\\[command][/green] [green3][...args][/green3] [violet][--help][/violet] [grey53][--version][/grey53] + + [grey53]# using Docker:[/grey53] + [blue]docker run[/blue] -v [light_slate_blue]$PWD:/data[/light_slate_blue] [grey53]-p 8000:8000[/grey53] 
-it [dark_green]archivebox/archivebox[/dark_green] [green]\\[command][/green] [green3][...args][/green3] [violet][--help][/violet] [grey53][--version][/grey53] +""" + if IN_DOCKER + else "" ) - parser.parse_args(args or ()) - reject_stdin(__command__, stdin) - - help(out_dir=pwd or OUTPUT_DIR) + DOCKER_DOCS = ( + "\n [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Docker[/link]" + if IN_DOCKER + else "" + ) + DOCKER_OUTSIDE_HINT = "\n [grey53]# outside of Docker:[/grey53]" if IN_DOCKER else "" + DOCKER_CMD_PREFIX = "[blue]docker ... [/blue]" if IN_DOCKER else "" + + print(f"""{DOCKER_USAGE} +[deep_sky_blue4]Usage:[/deep_sky_blue4]{DOCKER_OUTSIDE_HINT} + [dark_green]archivebox[/dark_green] [green]\\[command][/green] [green3][...args][/green3] [violet][--help][/violet] [grey53][--version][/grey53] + +[deep_sky_blue4]Commands:[/deep_sky_blue4] + {COMMANDS_HELP_TEXT} + +[deep_sky_blue4]Documentation:[/deep_sky_blue4] + [link=https://github.com/ArchiveBox/ArchiveBox/wiki]https://github.com/ArchiveBox/ArchiveBox/wiki[/link]{DOCKER_DOCS} + [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#cli-usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Usage[/link] + [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration]https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration[/link] +""") + + if os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) and CONSTANTS.ARCHIVE_DIR.is_dir(): + pretty_out_dir = str(CONSTANTS.DATA_DIR).replace(str(Path("~").expanduser()), "~") + EXAMPLE_USAGE = f""" +[light_slate_blue]DATA DIR[/light_slate_blue]: [yellow]{pretty_out_dir}[/yellow] + +[violet]Hint:[/violet] [i]Common maintenance tasks:[/i] + [dark_green]archivebox[/dark_green] [green]init[/green] [grey53]# make sure database is up-to-date (safe to run multiple times)[/grey53] + [dark_green]archivebox[/dark_green] [green]install[/green] [grey53]# make sure plugins are up-to-date (wget, chrome, singlefile, etc.)[/grey53] 
+ [dark_green]archivebox[/dark_green] [green]status[/green] [grey53]# get a health checkup report on your collection[/grey53] + [dark_green]archivebox[/dark_green] [green]update[/green] [grey53]# retry any previously failed or interrupted archiving tasks[/grey53] + +[violet]Hint:[/violet] [i]More example usage:[/i] + [dark_green]archivebox[/dark_green] [green]add[/green] --depth=1 "https://example.com/some/page" + [dark_green]archivebox[/dark_green] [green]list[/green] --sort=timestamp --csv=timestamp,downloaded_at,url,title + [dark_green]archivebox[/dark_green] [green]schedule[/green] --every=day --depth=1 "https://example.com/some/feed.rss" + [dark_green]archivebox[/dark_green] [green]server[/green] [blue]0.0.0.0:8000[/blue] [grey53]# Start the Web UI / API server[/grey53] +""" + print( + Panel( + EXAMPLE_USAGE, + expand=False, + border_style="grey53", + title="[green3]:white_check_mark: A collection [light_slate_blue]DATA DIR[/light_slate_blue] is currently active[/green3]", + subtitle="Commands run inside this dir will only apply to this collection.", + ), + ) + else: + DATA_SETUP_HELP = "\n" + if IN_DOCKER: + DATA_SETUP_HELP += "[violet]Hint:[/violet] When using Docker, you need to mount a volume to use as your data dir:\n" + DATA_SETUP_HELP += " docker run [violet]-v /some/path/data:/data[/violet] archivebox/archivebox ...\n\n" + DATA_SETUP_HELP += "To load an [dark_blue]existing[/dark_blue] collection:\n" + DATA_SETUP_HELP += " 1. [green]cd[/green] ~/archivebox/data [grey53]# go into existing [light_slate_blue]DATA DIR[/light_slate_blue] (can be anywhere)[/grey53]\n" + DATA_SETUP_HELP += f" 2. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]init[/green] [grey53]# migrate to latest version (safe to run multiple times)[/grey53]\n" + DATA_SETUP_HELP += f" 3. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]install[/green] [grey53]# auto-update all plugins (wget, chrome, singlefile, etc.)[/grey53]\n" + DATA_SETUP_HELP += f" 4. 
{DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]help[/green] [grey53]# ...get help with next steps... [/grey53]\n\n" + DATA_SETUP_HELP += "To start a [sea_green1]new[/sea_green1] collection:\n" + DATA_SETUP_HELP += " 1. [green]mkdir[/green] ~/archivebox/data [grey53]# create a new, empty [light_slate_blue]DATA DIR[/light_slate_blue] (can be anywhere)[/grey53]\n" + DATA_SETUP_HELP += " 2. [green]cd[/green] ~/archivebox/data [grey53]# cd into the new directory[/grey53]\n" + DATA_SETUP_HELP += f" 3. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]init[/green] [grey53]# initialize ArchiveBox in the new data dir[/grey53]\n" + DATA_SETUP_HELP += f" 4. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]install[/green] [grey53]# auto-install all plugins (wget, chrome, singlefile, etc.)[/grey53]\n" + DATA_SETUP_HELP += f" 5. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]help[/green] [grey53]# ... get help with next steps... [/grey53]\n" + print( + Panel( + DATA_SETUP_HELP, + expand=False, + border_style="grey53", + title="[red]:cross_mark: No collection is currently active[/red]", + subtitle="All archivebox [green]commands[/green] should be run from inside a collection [light_slate_blue]DATA DIR[/light_slate_blue]", + ), + ) + + +@click.command() +@click.option("--help", "-h", is_flag=True, help="Show help") +def main(**kwargs): + """Print the ArchiveBox help message and usage""" + return help() -if __name__ == '__main__': - main(args=sys.argv[1:], stdin=sys.stdin) +if __name__ == "__main__": + main() diff --git a/archivebox/cli/archivebox_init.py b/archivebox/cli/archivebox_init.py index 48b65b1f90..2376e1f312 100755 --- a/archivebox/cli/archivebox_init.py +++ b/archivebox/cli/archivebox_init.py @@ -1,52 +1,233 @@ #!/usr/bin/env python3 -__package__ = 'archivebox.cli' -__command__ = 'archivebox init' +__package__ = "archivebox.cli" +import os import sys -import argparse +from pathlib import Path +from collections.abc 
import Mapping -from typing import Optional, List, IO +from rich import print +import rich_click as click -from ..main import init -from ..util import docstring -from ..config import OUTPUT_DIR -from ..logging_util import SmartFormatter, reject_stdin +from archivebox.misc.util import docstring, enforce_types -@docstring(init.__doc__) -def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None: - parser = argparse.ArgumentParser( - prog=__command__, - description=init.__doc__, - add_help=True, - formatter_class=SmartFormatter, - ) - parser.add_argument( - '--force', # '-f', - action='store_true', - help='Ignore unrecognized files in current directory and initialize anyway', - ) - parser.add_argument( - '--quick', '-q', - action='store_true', - help='Run any updates or migrations without rechecking all snapshot dirs', - ) - parser.add_argument( - '--setup', #'-s', - action='store_true', - help='Automatically install dependencies and extras used for archiving', - ) - command = parser.parse_args(args or ()) - reject_stdin(__command__, stdin) - - init( - force=command.force, - quick=command.quick, - setup=command.setup, - out_dir=pwd or OUTPUT_DIR, +def _normalize_snapshot_record(link_dict: Mapping[str, object]) -> tuple[str, dict[str, object]] | None: + url = link_dict.get("url") + if not isinstance(url, str) or not url: + return None + + record: dict[str, object] = {"url": url} + for key in ("timestamp", "title", "tags", "sources"): + value = link_dict.get(key) + if value is not None: + record[key] = value + return url, record + + +@enforce_types +def init(force: bool = False, quick: bool = False, install: bool = False) -> None: + """Initialize a new ArchiveBox collection in the current directory""" + + from archivebox.config import CONSTANTS, VERSION, DATA_DIR + from archivebox.config.common import SERVER_CONFIG + from archivebox.config.collection import write_config_file + from archivebox.misc.legacy import 
parse_json_main_index, parse_json_links_details + from archivebox.misc.db import apply_migrations + + # if os.access(out_dir / CONSTANTS.JSON_INDEX_FILENAME, os.F_OK): + # print("[red]:warning: This folder contains a JSON index. It is deprecated, and will no longer be kept up to date automatically.[/red]", file=sys.stderr) + # print("[red] You can run `archivebox list --json --with-headers > static_index.json` to manually generate it.[/red]", file=sys.stderr) + + is_empty = not len(set(os.listdir(DATA_DIR)) - CONSTANTS.ALLOWED_IN_DATA_DIR) + existing_index = os.path.isfile(CONSTANTS.DATABASE_FILE) + if is_empty and not existing_index: + print(f"[turquoise4][+] Initializing a new ArchiveBox v{VERSION} collection...[/turquoise4]") + print("[green]----------------------------------------------------------------------[/green]") + elif existing_index: + # TODO: properly detect and print the existing version in current index as well + print(f"[green][*] Verifying and updating existing ArchiveBox collection to v{VERSION}...[/green]") + print("[green]----------------------------------------------------------------------[/green]") + else: + if force: + print("[red][!] 
This folder appears to already have files in it, but no index.sqlite3 is present.[/red]") + print("[red] Because --force was passed, ArchiveBox will initialize anyway (which may overwrite existing files).[/red]") + else: + print( + "[red][X] This folder appears to already have files in it, but no index.sqlite3 present.[/red]\n\n" + " You must run init in a completely empty directory, or an existing data folder.\n\n" + " [violet]Hint:[/violet] To import an existing data folder make sure to cd into the folder first, \n" + " then run and run 'archivebox init' to pick up where you left off.\n\n" + " (Always make sure your data folder is backed up first before updating ArchiveBox)", + ) + raise SystemExit(2) + + if existing_index: + print("\n[green][*] Verifying archive folder structure...[/green]") + else: + print("\n[green][+] Building archive folder structure...[/green]") + + print( + f" + ./{CONSTANTS.ARCHIVE_DIR.relative_to(DATA_DIR)}, ./{CONSTANTS.SOURCES_DIR.relative_to(DATA_DIR)}, ./{CONSTANTS.LOGS_DIR.relative_to(DATA_DIR)}...", ) - + Path(CONSTANTS.SOURCES_DIR).mkdir(exist_ok=True) + Path(CONSTANTS.ARCHIVE_DIR).mkdir(exist_ok=True) + Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True) + + print(f" + ./{CONSTANTS.CONFIG_FILE.relative_to(DATA_DIR)}...") + + # create the .archivebox_id file with a unique ID for this collection + from archivebox.config.paths import _get_collection_id + + _get_collection_id(DATA_DIR, force_create=True) + + # create the ArchiveBox.conf file + write_config_file({"SECRET_KEY": SERVER_CONFIG.SECRET_KEY}) + + if os.access(CONSTANTS.DATABASE_FILE, os.F_OK): + print("\n[green][*] Verifying main SQL index and running any migrations needed...[/green]") + else: + print("\n[green][+] Building main SQL index and running initial migrations...[/green]") + + from archivebox.config.django import setup_django + + setup_django() + + for migration_line in apply_migrations(DATA_DIR): + sys.stdout.write(f" {migration_line}\n") + + assert 
os.path.isfile(CONSTANTS.DATABASE_FILE) and os.access(CONSTANTS.DATABASE_FILE, os.R_OK) + print() + print(f" √ ./{CONSTANTS.DATABASE_FILE.relative_to(DATA_DIR)}") + + # from django.contrib.auth.models import User + # if SHELL_CONFIG.IS_TTY and not User.objects.filter(is_superuser=True).exclude(username='system').exists(): + # print('{green}[+] Creating admin user account...{reset}'.format(**SHELL_CONFIG.ANSI)) + # call_command("createsuperuser", interactive=True) + + print() + print("[dodger_blue3][*] Checking links from indexes and archive folders (safe to Ctrl+C)...[/dodger_blue3]") + + from archivebox.core.models import Snapshot + + all_links = Snapshot.objects.none() + pending_links: dict[str, dict[str, object]] = {} + + if existing_index: + all_links = Snapshot.objects.all() + print(f" √ Loaded {all_links.count()} links from existing main index.") + + if quick: + print(" > Skipping orphan snapshot import (quick mode)") + else: + try: + # Import orphaned links from legacy JSON indexes + orphaned_json_links: dict[str, dict[str, object]] = {} + for link_dict in parse_json_main_index(DATA_DIR): + normalized = _normalize_snapshot_record(link_dict) + if normalized is None: + continue + url, record = normalized + if not all_links.filter(url=url).exists(): + orphaned_json_links[url] = record + if orphaned_json_links: + pending_links.update(orphaned_json_links) + print(f" [yellow]√ Added {len(orphaned_json_links)} orphaned links from existing JSON index...[/yellow]") + + orphaned_data_dir_links: dict[str, dict[str, object]] = {} + for link_dict in parse_json_links_details(DATA_DIR): + normalized = _normalize_snapshot_record(link_dict) + if normalized is None: + continue + url, record = normalized + if not all_links.filter(url=url).exists(): + orphaned_data_dir_links[url] = record + if orphaned_data_dir_links: + pending_links.update(orphaned_data_dir_links) + print(f" [yellow]√ Added {len(orphaned_data_dir_links)} orphaned links from existing archive 
directories.[/yellow]") + + if pending_links: + for link_dict in pending_links.values(): + Snapshot.from_json(link_dict) + + # Hint for orphaned snapshot directories + print() + print(" [violet]Hint:[/violet] To import orphaned snapshot directories and reconcile filesystem state, run:") + print(" archivebox update") + + except (KeyboardInterrupt, SystemExit): + print(file=sys.stderr) + print("[yellow]:stop_sign: Stopped checking archive directories due to Ctrl-C/SIGTERM[/yellow]", file=sys.stderr) + print(" Your archive data is safe, but you should re-run `archivebox init` to finish the process later.", file=sys.stderr) + print(file=sys.stderr) + print(" [violet]Hint:[/violet] In the future you can run a quick init without checking dirs like so:", file=sys.stderr) + print(" archivebox init --quick", file=sys.stderr) + raise SystemExit(1) + + print("\n[green]----------------------------------------------------------------------[/green]") + + from django.contrib.auth.models import User + + if (SERVER_CONFIG.ADMIN_USERNAME and SERVER_CONFIG.ADMIN_PASSWORD) and not User.objects.filter( + username=SERVER_CONFIG.ADMIN_USERNAME, + ).exists(): + print("[green][+] Found ADMIN_USERNAME and ADMIN_PASSWORD configuration options, creating new admin user.[/green]") + User.objects.create_superuser(username=SERVER_CONFIG.ADMIN_USERNAME, password=SERVER_CONFIG.ADMIN_PASSWORD) + + if existing_index: + print("[green][√] Done. Verified and updated the existing ArchiveBox collection.[/green]") + else: + print(f"[green][√] Done. 
A new ArchiveBox collection was initialized ({len(all_links) + len(pending_links)} links).[/green]") + + CONSTANTS.PERSONAS_DIR.mkdir(parents=True, exist_ok=True) + CONSTANTS.DEFAULT_TMP_DIR.mkdir(parents=True, exist_ok=True) + CONSTANTS.DEFAULT_LIB_DIR.mkdir(parents=True, exist_ok=True) + (CONSTANTS.DEFAULT_LIB_DIR / "bin").mkdir(parents=True, exist_ok=True) + + from archivebox.config.common import STORAGE_CONFIG + from archivebox.config.paths import get_or_create_working_tmp_dir, get_or_create_working_lib_dir + + STORAGE_CONFIG.TMP_DIR.mkdir(parents=True, exist_ok=True) + STORAGE_CONFIG.LIB_DIR.mkdir(parents=True, exist_ok=True) + (STORAGE_CONFIG.LIB_DIR / "bin").mkdir(parents=True, exist_ok=True) + + working_tmp_dir = get_or_create_working_tmp_dir(autofix=True, quiet=True) + if working_tmp_dir: + working_tmp_dir.mkdir(parents=True, exist_ok=True) + + working_lib_dir = get_or_create_working_lib_dir(autofix=True, quiet=True) + if working_lib_dir: + working_lib_dir.mkdir(parents=True, exist_ok=True) + (working_lib_dir / "bin").mkdir(parents=True, exist_ok=True) + + if install: + from archivebox.cli.archivebox_install import install as install_method + + install_method() + + if Snapshot.objects.count() < 25: # hide the hints for experienced users + print() + print(" [violet]Hint:[/violet] To view your archive index, run:") + print( + " archivebox server # then visit [deep_sky_blue4][link=http://127.0.0.1:8000]http://127.0.0.1:8000[/link][/deep_sky_blue4]", + ) + print() + print(" To add new links, you can run:") + print(" archivebox add < ~/some/path/to/list_of_links.txt") + print() + print(" For more usage and examples, run:") + print(" archivebox help") + + +@click.command() +@click.option("--force", "-f", is_flag=True, help="Ignore unrecognized files in current directory and initialize anyway") +@click.option("--quick", "-q", is_flag=True, help="Run any updates or migrations without rechecking all snapshot dirs") +@click.option("--install", "-s", is_flag=True, 
help="Automatically install dependencies and extras used for archiving") +@docstring(init.__doc__) +def main(**kwargs) -> None: + init(**kwargs) + -if __name__ == '__main__': - main(args=sys.argv[1:], stdin=sys.stdin) +if __name__ == "__main__": + main() diff --git a/archivebox/cli/archivebox_install.py b/archivebox/cli/archivebox_install.py new file mode 100755 index 0000000000..a8f956cb25 --- /dev/null +++ b/archivebox/cli/archivebox_install.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python3 + +__package__ = "archivebox.cli" + +import os + +import rich_click as click +from rich import print + +from archivebox.misc.util import docstring, enforce_types + + +@enforce_types +def install(binaries: tuple[str, ...] = (), binproviders: str = "*", dry_run: bool = False) -> None: + """Detect and install ArchiveBox dependencies by running the abx-dl install flow + + Examples: + archivebox install # Install all dependencies + archivebox install wget curl # Install only wget and curl + archivebox install --binproviders=pip yt-dlp # Install yt-dlp using only pip + archivebox install --binproviders=brew,apt # Install all deps using only brew or apt + """ + + from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP + from archivebox.config.paths import ARCHIVE_DIR + from archivebox.misc.logging import stderr + from archivebox.cli.archivebox_init import init + + if not (os.access(ARCHIVE_DIR, os.R_OK) and ARCHIVE_DIR.is_dir()): + init() # must init full index because we need a db to store Binary entries in + + # Show what we're installing + if binaries: + print(f"\n[green][+] Installing specific binaries: {', '.join(binaries)}[/green]") + else: + print("\n[green][+] Detecting and installing all ArchiveBox dependencies...[/green]") + + if binproviders != "*": + print(f"[green][+] Using providers: {binproviders}[/green]") + + if IS_ROOT: + EUID = os.geteuid() + print() + print(f"[yellow]:warning: Running as UID=[blue]{EUID}[/blue].[/yellow]") + print(f" 
DATA_DIR will be owned by [blue]{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/blue].") + print() + + if dry_run: + print("[dim]Dry run - would run the abx-dl install flow[/dim]") + return + + # Set up Django + from archivebox.config.django import setup_django + + setup_django() + + plugin_names = list(binaries) + if binproviders != "*": + plugin_names.extend(provider.strip() for provider in binproviders.split(",") if provider.strip()) + + print("[+] Running installer via abx-dl bus...") + print() + + from archivebox.services.runner import run_install + + run_install(plugin_names=plugin_names or None) + + print() + + # Check for superuser + from django.contrib.auth import get_user_model + + User = get_user_model() + + if not User.objects.filter(is_superuser=True).exclude(username="system").exists(): + stderr("\n[+] Don't forget to create a new admin user for the Web UI...", color="green") + stderr(" archivebox manage createsuperuser") + + print() + + # Show version to display full status including installed binaries + # Django is already loaded, so just import and call the function directly + from archivebox.cli.archivebox_version import version as show_version + + show_version(quiet=False) + + +@click.command() +@click.argument("binaries", nargs=-1, type=str, required=False) +@click.option( + "--binproviders", + "-p", + default="*", + help="Comma-separated list of providers to use (pip,npm,brew,apt,env,custom) or * for all", + show_default=True, +) +@click.option("--dry-run", "-d", is_flag=True, help="Show what would happen without actually running", default=False) +@docstring(install.__doc__) +def main(**kwargs) -> None: + install(**kwargs) + + +if __name__ == "__main__": + main() diff --git a/archivebox/cli/archivebox_list.py b/archivebox/cli/archivebox_list.py index 5477bfc86c..1435945320 100644 --- a/archivebox/cli/archivebox_list.py +++ b/archivebox/cli/archivebox_list.py @@ -1,139 +1,57 @@ #!/usr/bin/env python3 -__package__ = 'archivebox.cli' -__command__ = 
'archivebox list' +__package__ = "archivebox.cli" +__command__ = "archivebox list" import sys -import argparse -from typing import Optional, List, IO - -from ..main import list_all -from ..util import docstring -from ..config import OUTPUT_DIR -from ..index import ( - LINK_FILTERS, - get_indexed_folders, - get_archived_folders, - get_unarchived_folders, - get_present_folders, - get_valid_folders, - get_invalid_folders, - get_duplicate_folders, - get_orphaned_folders, - get_corrupted_folders, - get_unrecognized_folders, -) -from ..logging_util import SmartFormatter, reject_stdin, stderr - - -@docstring(list_all.__doc__) -def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None: - parser = argparse.ArgumentParser( - prog=__command__, - description=list_all.__doc__, - add_help=True, - formatter_class=SmartFormatter, - ) - group = parser.add_mutually_exclusive_group() - group.add_argument( - '--csv', #'-c', - type=str, - help="Print the output in CSV format with the given columns, e.g.: timestamp,url,extension", - default=None, - ) - group.add_argument( - '--json', #'-j', - action='store_true', - help="Print the output in JSON format with all columns included", - ) - group.add_argument( - '--html', - action='store_true', - help="Print the output in HTML format" - ) - parser.add_argument( - '--with-headers', - action='store_true', - help='Include the headers in the output document' - ) - parser.add_argument( - '--sort', #'-s', - type=str, - help="List the links sorted using the given key, e.g. 
timestamp or updated", - default=None, - ) - parser.add_argument( - '--before', #'-b', - type=float, - help="List only links bookmarked before (less than) the given timestamp", - default=None, - ) - parser.add_argument( - '--after', #'-a', - type=float, - help="List only links bookmarked after (greater than or equal to) the given timestamp", - default=None, - ) - parser.add_argument( - '--status', - type=str, - choices=('indexed', 'archived', 'unarchived', 'present', 'valid', 'invalid', 'duplicate', 'orphaned', 'corrupted', 'unrecognized'), - default='indexed', - help=( - 'List only links or data directories that have the given status\n' - f' indexed {get_indexed_folders.__doc__} (the default)\n' - f' archived {get_archived_folders.__doc__}\n' - f' unarchived {get_unarchived_folders.__doc__}\n' - '\n' - f' present {get_present_folders.__doc__}\n' - f' valid {get_valid_folders.__doc__}\n' - f' invalid {get_invalid_folders.__doc__}\n' - '\n' - f' duplicate {get_duplicate_folders.__doc__}\n' - f' orphaned {get_orphaned_folders.__doc__}\n' - f' corrupted {get_corrupted_folders.__doc__}\n' - f' unrecognized {get_unrecognized_folders.__doc__}\n' - ) - ) - parser.add_argument( - '--filter-type', '-t', - type=str, - choices=(*LINK_FILTERS.keys(), 'search'), - default='exact', - help='Type of pattern matching to use when filtering URLs', - ) - parser.add_argument( - 'filter_patterns', - nargs='*', - type=str, - default=None, - help='List only URLs matching these filter patterns' - ) - command = parser.parse_args(args or ()) - reject_stdin(stdin) - - if command.with_headers and not (command.json or command.html or command.csv): - stderr( - '[X] --with-headers can only be used with --json, --html or --csv options\n', - color='red', - ) - raise SystemExit(2) - - matching_folders = list_all( - filter_patterns=command.filter_patterns, - filter_type=command.filter_type, - status=command.status, - after=command.after, - before=command.before, - sort=command.sort, - 
csv=command.csv, - json=command.json, - html=command.html, - with_headers=command.with_headers, - out_dir=pwd or OUTPUT_DIR, - ) - raise SystemExit(not matching_folders) - -if __name__ == '__main__': - main(args=sys.argv[1:], stdin=sys.stdin) +import rich_click as click + +from archivebox.cli.archivebox_snapshot import list_snapshots + + +@click.command() +@click.option("--status", "-s", help="Filter by status (queued, started, sealed)") +@click.option("--url__icontains", help="Filter by URL contains") +@click.option("--url__istartswith", help="Filter by URL starts with") +@click.option("--tag", "-t", help="Filter by tag name") +@click.option("--crawl-id", help="Filter by crawl ID") +@click.option("--limit", "-n", type=int, help="Limit number of results") +@click.option("--sort", "-o", type=str, help="Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at") +@click.option("--csv", "-C", type=str, help="Print output as CSV with the provided fields, e.g.: timestamp,url,title") +@click.option("--with-headers", is_flag=True, help="Include column headers in structured output") +@click.option("--search", type=click.Choice(["meta", "content", "contents", "deep"]), help="Search mode to use for the query") +@click.argument("query", nargs=-1) +def main( + status: str | None, + url__icontains: str | None, + url__istartswith: str | None, + tag: str | None, + crawl_id: str | None, + limit: int | None, + sort: str | None, + csv: str | None, + with_headers: bool, + search: str | None, + query: tuple[str, ...], +) -> None: + """List Snapshots.""" + sys.exit( + list_snapshots( + status=status, + url__icontains=url__icontains, + url__istartswith=url__istartswith, + tag=tag, + crawl_id=crawl_id, + limit=limit, + sort=sort, + csv=csv, + with_headers=with_headers, + search=search, + query=" ".join(query), + ), + ) + + +if __name__ == "__main__": + main() diff --git a/archivebox/cli/archivebox_machine.py b/archivebox/cli/archivebox_machine.py new file mode 100644 index 
0000000000..688216c559 --- /dev/null +++ b/archivebox/cli/archivebox_machine.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python3 + +""" +archivebox machine [--filters] + +Manage Machine records (system-managed, mostly read-only). + +Machine records track the host machines where ArchiveBox runs. +They are created automatically by the system and are primarily for debugging. + +Actions: + list - List Machines as JSONL (with optional filters) + +Examples: + # List all machines + archivebox machine list + + # List machines by hostname + archivebox machine list --hostname__icontains=myserver +""" + +__package__ = "archivebox.cli" +__command__ = "archivebox machine" + +import sys + +import rich_click as click +from rich import print as rprint + +from archivebox.cli.cli_utils import apply_filters + + +# ============================================================================= +# LIST +# ============================================================================= + + +def list_machines( + hostname__icontains: str | None = None, + os_platform: str | None = None, + limit: int | None = None, +) -> int: + """ + List Machines as JSONL with optional filters. 
+ + Exit codes: + 0: Success (even if no results) + """ + from archivebox.misc.jsonl import write_record + from archivebox.machine.models import Machine + + is_tty = sys.stdout.isatty() + + queryset = Machine.objects.all().order_by("-created_at") + + # Apply filters + filter_kwargs = { + "hostname__icontains": hostname__icontains, + "os_platform": os_platform, + } + queryset = apply_filters(queryset, filter_kwargs, limit=limit) + + count = 0 + for machine in queryset: + if is_tty: + rprint(f"[cyan]{machine.hostname:30}[/cyan] [dim]{machine.os_platform:10}[/dim] {machine.id}") + else: + write_record(machine.to_json()) + count += 1 + + rprint(f"[dim]Listed {count} machines[/dim]", file=sys.stderr) + return 0 + + +# ============================================================================= +# CLI Commands +# ============================================================================= + + +@click.group() +def main(): + """Manage Machine records (read-only, system-managed).""" + pass + + +@main.command("list") +@click.option("--hostname__icontains", help="Filter by hostname contains") +@click.option("--os-platform", help="Filter by OS platform") +@click.option("--limit", "-n", type=int, help="Limit number of results") +def list_cmd(hostname__icontains: str | None, os_platform: str | None, limit: int | None): + """List Machines as JSONL.""" + sys.exit( + list_machines( + hostname__icontains=hostname__icontains, + os_platform=os_platform, + limit=limit, + ), + ) + + +if __name__ == "__main__": + main() diff --git a/archivebox/cli/archivebox_manage.py b/archivebox/cli/archivebox_manage.py index f05604e183..7105161c17 100644 --- a/archivebox/cli/archivebox_manage.py +++ b/archivebox/cli/archivebox_manage.py @@ -1,24 +1,34 @@ #!/usr/bin/env python3 -__package__ = 'archivebox.cli' -__command__ = 'archivebox manage' +__package__ = "archivebox.cli" -import sys +import rich_click as click +from archivebox.misc.util import docstring, enforce_types -from typing import 
Optional, List, IO -from ..main import manage -from ..util import docstring -from ..config import OUTPUT_DIR +@enforce_types +def manage(args: list[str] | None = None) -> None: + """Run an ArchiveBox Django management command""" + from archivebox.config.common import SHELL_CONFIG + from archivebox.misc.logging import stderr + if (args and "createsuperuser" in args) and (SHELL_CONFIG.IN_DOCKER and not SHELL_CONFIG.IS_TTY): + stderr("[!] Warning: you need to pass -it to use interactive commands in docker", color="lightyellow") + stderr(" docker run -it archivebox manage {}".format(" ".join(args or ["..."])), color="lightyellow") + stderr("") + + from django.core.management import execute_from_command_line + + execute_from_command_line(["manage.py", *(args or ["help"])]) + + +@click.command(add_help_option=False, context_settings=dict(ignore_unknown_options=True)) +@click.argument("args", nargs=-1) @docstring(manage.__doc__) -def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None: - manage( - args=args, - out_dir=pwd or OUTPUT_DIR, - ) +def main(args: list[str] | None = None) -> None: + manage(args=args) -if __name__ == '__main__': - main(args=sys.argv[1:], stdin=sys.stdin) +if __name__ == "__main__": + main() diff --git a/archivebox/cli/archivebox_mcp.py b/archivebox/cli/archivebox_mcp.py new file mode 100644 index 0000000000..cbc2ba19e1 --- /dev/null +++ b/archivebox/cli/archivebox_mcp.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python3 +""" +archivebox mcp + +Start the Model Context Protocol (MCP) server in stdio mode. +Exposes all ArchiveBox CLI commands as MCP tools for AI agents. +""" + +__package__ = "archivebox.cli" +__command__ = "archivebox mcp" + +import rich_click as click + +from archivebox.misc.util import docstring, enforce_types + + +@enforce_types +def mcp(): + """ + Start the MCP server in stdio mode for AI agent control. 
+ + The MCP (Model Context Protocol) server exposes all ArchiveBox CLI commands + as tools that AI agents can discover and execute. It communicates via JSON-RPC + 2.0 over stdin/stdout. + + Example usage with an MCP client: + archivebox mcp < requests.jsonl > responses.jsonl + + Or interactively: + archivebox mcp + {"jsonrpc":"2.0","id":1,"method":"initialize","params":{}} + {"jsonrpc":"2.0","id":2,"method":"tools/list","params":{}} + """ + + from archivebox.mcp.server import run_mcp_server + + # Run the stdio server (blocks until stdin closes) + run_mcp_server() + + +@click.command() +@docstring(mcp.__doc__) +def main(**kwargs): + """Start the MCP server in stdio mode""" + mcp() + + +if __name__ == "__main__": + main() diff --git a/archivebox/cli/archivebox_oneshot.py b/archivebox/cli/archivebox_oneshot.py deleted file mode 100644 index 411cce8b17..0000000000 --- a/archivebox/cli/archivebox_oneshot.py +++ /dev/null @@ -1,73 +0,0 @@ -#!/usr/bin/env python3 - -__package__ = 'archivebox.cli' -__command__ = 'archivebox oneshot' - -import sys -import argparse - -from pathlib import Path -from typing import List, Optional, IO - -from ..main import oneshot -from ..util import docstring -from ..config import OUTPUT_DIR -from ..logging_util import SmartFormatter, accept_stdin, stderr - - -@docstring(oneshot.__doc__) -def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None: - parser = argparse.ArgumentParser( - prog=__command__, - description=oneshot.__doc__, - add_help=True, - formatter_class=SmartFormatter, - ) - parser.add_argument( - 'url', - type=str, - default=None, - help=( - 'URLs or paths to archive e.g.:\n' - ' https://getpocket.com/users/USERNAME/feed/all\n' - ' https://example.com/some/rss/feed.xml\n' - ' https://example.com\n' - ' ~/Downloads/firefox_bookmarks_export.html\n' - ' ~/Desktop/sites_list.csv\n' - ) - ) - parser.add_argument( - "--extract", - type=str, - help="Pass a list of the extractors to be used. 
If the method name is not correct, it will be ignored. \ - This does not take precedence over the configuration", - default="" - ) - parser.add_argument( - '--out-dir', - type=str, - default=OUTPUT_DIR, - help= "Path to save the single archive folder to, e.g. ./example.com_archive" - ) - command = parser.parse_args(args or ()) - stdin_url = None - url = command.url - if not url: - stdin_url = accept_stdin(stdin) - - if (stdin_url and url) or (not stdin and not url): - stderr( - '[X] You must pass a URL/path to add via stdin or CLI arguments.\n', - color='red', - ) - raise SystemExit(2) - - oneshot( - url=stdin_url or url, - out_dir=Path(command.out_dir).resolve(), - extractors=command.extract, - ) - - -if __name__ == '__main__': - main(args=sys.argv[1:], stdin=sys.stdin) diff --git a/archivebox/cli/archivebox_persona.py b/archivebox/cli/archivebox_persona.py new file mode 100644 index 0000000000..7f930665e0 --- /dev/null +++ b/archivebox/cli/archivebox_persona.py @@ -0,0 +1,776 @@ +#!/usr/bin/env python3 + +""" +archivebox persona [args...] [--filters] + +Manage Persona records (browser profiles for archiving). 
+ +Actions: + create - Create Personas + list - List Personas as JSONL (with optional filters) + update - Update Personas from stdin JSONL + delete - Delete Personas from stdin JSONL + +Examples: + # Create a new persona + archivebox persona create work + archivebox persona create --import=chrome personal + archivebox persona create --import=edge work + + # List all personas + archivebox persona list + + # Delete a persona + archivebox persona list --name=old | archivebox persona delete --yes +""" + +__package__ = "archivebox.cli" +__command__ = "archivebox persona" + +import os +import sys +import shutil +import platform +import subprocess +import tempfile +import json +from pathlib import Path +from collections.abc import Iterable +from collections import OrderedDict + +import rich_click as click +from rich import print as rprint + +from archivebox.cli.cli_utils import apply_filters +from archivebox.personas import importers as persona_importers + + +# ============================================================================= +# Browser Profile Locations +# ============================================================================= + + +def get_chrome_user_data_dir() -> Path | None: + """Get the default Chrome user data directory for the current platform.""" + system = platform.system() + home = Path.home() + + if system == "Darwin": # macOS + candidates = [ + home / "Library" / "Application Support" / "Google" / "Chrome", + home / "Library" / "Application Support" / "Chromium", + ] + elif system == "Linux": + candidates = [ + home / ".config" / "google-chrome", + home / ".config" / "chromium", + home / ".config" / "chrome", + home / "snap" / "chromium" / "common" / "chromium", + ] + elif system == "Windows": + local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local")) + candidates = [ + local_app_data / "Google" / "Chrome" / "User Data", + local_app_data / "Chromium" / "User Data", + ] + else: + candidates = [] + + for candidate in 
candidates: + if candidate.exists() and (candidate / "Default").exists(): + return candidate + + return None + + +def get_brave_user_data_dir() -> Path | None: + """Get the default Brave user data directory for the current platform.""" + system = platform.system() + home = Path.home() + + if system == "Darwin": + candidates = [ + home / "Library" / "Application Support" / "BraveSoftware" / "Brave-Browser", + ] + elif system == "Linux": + candidates = [ + home / ".config" / "BraveSoftware" / "Brave-Browser", + ] + elif system == "Windows": + local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local")) + candidates = [ + local_app_data / "BraveSoftware" / "Brave-Browser" / "User Data", + ] + else: + candidates = [] + + for candidate in candidates: + if candidate.exists() and (candidate / "Default").exists(): + return candidate + + return None + + +def get_edge_user_data_dir() -> Path | None: + """Get the default Edge user data directory for the current platform.""" + system = platform.system() + home = Path.home() + + if system == "Darwin": + candidates = [ + home / "Library" / "Application Support" / "Microsoft Edge", + ] + elif system == "Linux": + candidates = [ + home / ".config" / "microsoft-edge", + home / ".config" / "microsoft-edge-beta", + home / ".config" / "microsoft-edge-dev", + ] + elif system == "Windows": + local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local")) + candidates = [ + local_app_data / "Microsoft" / "Edge" / "User Data", + ] + else: + candidates = [] + + for candidate in candidates: + if candidate.exists() and (candidate / "Default").exists(): + return candidate + + return None + + +def get_browser_binary(browser: str) -> str | None: + system = platform.system() + home = Path.home() + browser = browser.lower() + + if system == "Darwin": + candidates = { + "chrome": ["/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"], + "chromium": 
["/Applications/Chromium.app/Contents/MacOS/Chromium"], + "brave": ["/Applications/Brave Browser.app/Contents/MacOS/Brave Browser"], + "edge": ["/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge"], + }.get(browser, []) + elif system == "Linux": + candidates = { + "chrome": [ + "/usr/bin/google-chrome", + "/usr/bin/google-chrome-stable", + "/usr/bin/google-chrome-beta", + "/usr/bin/google-chrome-unstable", + ], + "chromium": ["/usr/bin/chromium", "/usr/bin/chromium-browser"], + "brave": ["/usr/bin/brave-browser", "/usr/bin/brave-browser-beta", "/usr/bin/brave-browser-nightly"], + "edge": [ + "/usr/bin/microsoft-edge", + "/usr/bin/microsoft-edge-stable", + "/usr/bin/microsoft-edge-beta", + "/usr/bin/microsoft-edge-dev", + ], + }.get(browser, []) + elif system == "Windows": + local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local")) + candidates = { + "chrome": [ + str(local_app_data / "Google" / "Chrome" / "Application" / "chrome.exe"), + "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe", + "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe", + ], + "chromium": [str(local_app_data / "Chromium" / "Application" / "chrome.exe")], + "brave": [ + str(local_app_data / "BraveSoftware" / "Brave-Browser" / "Application" / "brave.exe"), + "C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe", + "C:\\Program Files (x86)\\BraveSoftware\\Brave-Browser\\Application\\brave.exe", + ], + "edge": [ + str(local_app_data / "Microsoft" / "Edge" / "Application" / "msedge.exe"), + "C:\\Program Files\\Microsoft\\Edge\\Application\\msedge.exe", + "C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe", + ], + }.get(browser, []) + else: + candidates = [] + + for candidate in candidates: + if candidate and Path(candidate).exists(): + return candidate + + return None + + +BROWSER_PROFILE_FINDERS = { + "chrome": get_chrome_user_data_dir, + "chromium": get_chrome_user_data_dir, # Same locations + 
NETSCAPE_COOKIE_HEADER = [
    "# Netscape HTTP Cookie File",
    "# https://curl.se/docs/http-cookies.html",
    "# This file was generated by ArchiveBox persona cookie extraction",
    "#",
    "# Format: domain\\tincludeSubdomains\\tpath\\tsecure\\texpiry\\tname\\tvalue",
    "",
]

# Netscape-format cookie files mark HttpOnly cookies by prefixing the domain
# field with this token. Such lines start with "#" but are NOT comments.
_HTTPONLY_PREFIX = "#HttpOnly_"


def _parse_netscape_cookies(path: Path) -> "OrderedDict[tuple[str, str, str], tuple[str, str, str, str, str, str, str]]":
    """Parse a Netscape-format cookies.txt file.

    Args:
        path: Cookie file to read. A missing file yields an empty mapping.

    Returns:
        An ordered mapping of ``(domain, path, name)`` to the seven raw
        fields of the cookie line, in file order. The key uses the bare
        domain, so an HttpOnly and a plain entry for the same cookie collide.
        The stored domain field keeps any ``#HttpOnly_`` prefix so it
        survives a rewrite. Blank lines, comments, and lines with fewer than
        seven tab-separated fields are skipped.
    """
    cookies = OrderedDict()
    if not path.exists():
        return cookies

    for line in path.read_text(encoding="utf-8").splitlines():
        if not line:
            continue
        # "#HttpOnly_<domain>..." is a real cookie line; any other "#" is a comment.
        if line.startswith("#") and not line.startswith(_HTTPONLY_PREFIX):
            continue
        parts = line.split("\t")
        if len(parts) < 7:
            continue
        domain, include_subdomains, cookie_path, secure, expiry, name, value = parts[:7]
        key = (domain.removeprefix(_HTTPONLY_PREFIX), cookie_path, name)
        cookies[key] = (domain, include_subdomains, cookie_path, secure, expiry, name, value)
    return cookies


def _write_netscape_cookies(path: Path, cookies: "OrderedDict[tuple[str, str, str], tuple[str, str, str, str, str, str, str]]") -> None:
    """Write cookies to ``path`` in Netscape format, preceded by the standard header.

    The file is overwritten and encoded as UTF-8.
    """
    lines = list(NETSCAPE_COOKIE_HEADER)
    lines.extend("\t".join(cookie) for cookie in cookies.values())
    path.write_text("\n".join(lines) + "\n", encoding="utf-8")


def _merge_netscape_cookies(existing_file: Path, new_file: Path) -> None:
    """Merge the cookies in ``new_file`` into ``existing_file`` in place.

    Cookies are matched on ``(domain, path, name)``. A cookie present in both
    files takes its fields from ``new_file`` but keeps its position in
    ``existing_file``; cookies only in ``new_file`` are appended.
    """
    existing = _parse_netscape_cookies(existing_file)
    existing.update(_parse_netscape_cookies(new_file))
    _write_netscape_cookies(existing_file, existing)
+ + Returns True if successful, False otherwise. + """ + from archivebox.config.common import STORAGE_CONFIG + + # Find the cookie extraction script + chrome_plugin_dir = Path(__file__).parent.parent / "plugins" / "chrome" + extract_script = chrome_plugin_dir / "extract_cookies.js" + + if not extract_script.exists(): + rprint(f"[yellow]Cookie extraction script not found at {extract_script}[/yellow]", file=sys.stderr) + return False + + # Get node modules dir + node_modules_dir = STORAGE_CONFIG.LIB_DIR / "npm" / "node_modules" + + # Set up environment + env = os.environ.copy() + env["NODE_MODULES_DIR"] = str(node_modules_dir) + env["CHROME_USER_DATA_DIR"] = str(user_data_dir) + env["CHROME_HEADLESS"] = "true" + if chrome_binary: + env["CHROME_BINARY"] = str(chrome_binary) + output_path = output_file + temp_output = None + temp_dir = None + if output_file.exists(): + temp_dir = Path(tempfile.mkdtemp(prefix="ab_cookies_")) + temp_output = temp_dir / "cookies.txt" + output_path = temp_output + if profile_dir: + extra_arg = f"--profile-directory={profile_dir}" + existing_extra = env.get("CHROME_ARGS_EXTRA", "").strip() + args_list = [] + if existing_extra: + if existing_extra.startswith("["): + try: + parsed = json.loads(existing_extra) + if isinstance(parsed, list): + args_list.extend(str(x) for x in parsed) + except Exception: + args_list.extend([s.strip() for s in existing_extra.split(",") if s.strip()]) + else: + args_list.extend([s.strip() for s in existing_extra.split(",") if s.strip()]) + args_list.append(extra_arg) + env["CHROME_ARGS_EXTRA"] = json.dumps(args_list) + + env["COOKIES_OUTPUT_FILE"] = str(output_path) + + try: + result = subprocess.run( + ["node", str(extract_script)], + env=env, + capture_output=True, + text=True, + timeout=60, + ) + + if result.returncode == 0: + if temp_output and temp_output.exists(): + _merge_netscape_cookies(output_file, temp_output) + return True + else: + rprint(f"[yellow]Cookie extraction failed: {result.stderr}[/yellow]", 
def validate_persona_name(name: str) -> tuple[bool, str]:
    """
    Check that a persona name is safe to use as a directory name.

    Rules are applied in a fixed order and the first violated rule determines
    the message: empty/blank, path separators, ``..``, leading dot, then
    NUL / newline / carriage-return characters.

    Returns:
        (is_valid, error_message): ``(True, "")`` for an acceptable name,
        otherwise ``(False, <reason>)``.
    """
    if not name or not name.strip():
        return False, "Persona name cannot be empty"

    # (violated?, message) pairs, listed in reporting priority order.
    rules = (
        (
            any(separator in name for separator in ("/", "\\")),
            "Persona name cannot contain path separators (/ or \\)",
        ),
        (
            ".." in name,
            "Persona name cannot contain parent directory references (..)",
        ),
        (
            name.startswith("."),
            "Persona name cannot start with a dot (.)",
        ),
        (
            any(char in name for char in ("\x00", "\n", "\r")),
            "Persona name contains invalid characters",
        ),
    )

    for violated, message in rules:
        if violated:
            return False, message

    return True, ""
def create_personas(
    names: Iterable[str],
    import_from: str | None = None,
    profile: str | None = None,
) -> int:
    """
    Create Personas from names.

    If --import is specified, copy the browser profile to the persona directory
    and extract cookies.

    Args:
        names: Persona names to create; blank and invalid names are skipped.
        import_from: Browser key to import from (case-insensitive); must be a
            key of BROWSER_PROFILE_FINDERS.
        profile: Browser profile directory name; when omitted and the located
            user data dir has a "Default" sub-directory, "Default" is used.

    Progress messages go to stderr. When stdout is not a TTY, one JSONL record
    per processed persona (new or pre-existing) is written via write_record.

    Exit codes:
        0: Success
        1: Failure (no names given, unknown browser, browser profile directory
           not found, or the profile import raised)
    """
    from archivebox.misc.jsonl import write_record
    from archivebox.personas.models import Persona

    is_tty = sys.stdout.isatty()
    name_list = list(names) if names else []

    if not name_list:
        rprint("[yellow]No persona names provided. Pass names as arguments.[/yellow]", file=sys.stderr)
        return 1

    # Validate import source if specified
    source_profile_dir = None
    if import_from:
        import_from = import_from.lower()
        if import_from not in BROWSER_PROFILE_FINDERS:
            rprint(f"[red]Unknown browser: {import_from}[/red]", file=sys.stderr)
            rprint(f"[dim]Supported browsers: {', '.join(BROWSER_PROFILE_FINDERS.keys())}[/dim]", file=sys.stderr)
            return 1

        source_profile_dir = BROWSER_PROFILE_FINDERS[import_from]()
        if not source_profile_dir:
            rprint(f"[red]Could not find {import_from} profile directory[/red]", file=sys.stderr)
            return 1

        rprint(f"[dim]Found {import_from} profile: {source_profile_dir}[/dim]", file=sys.stderr)

        # Fall back to the "Default" profile only when the caller did not pick one.
        if profile is None and (source_profile_dir / "Default").exists():
            profile = "Default"

        # The binary is only reported here; it is not passed to the importer below.
        browser_binary = get_browser_binary(import_from)
        if browser_binary:
            rprint(f"[dim]Using {import_from} binary: {browser_binary}[/dim]", file=sys.stderr)

    created_count = 0
    for name in name_list:
        name = name.strip()
        if not name:
            continue

        # Validate persona name to prevent path traversal
        is_valid, error_msg = persona_importers.validate_persona_name(name)
        if not is_valid:
            rprint(f'[red]Invalid persona name "{name}": {error_msg}[/red]', file=sys.stderr)
            continue

        persona, created = Persona.objects.get_or_create(name=name)

        if created:
            persona.ensure_dirs()
            created_count += 1
            rprint(f"[green]Created persona: {name}[/green]", file=sys.stderr)
        else:
            rprint(f"[dim]Persona already exists: {name}[/dim]", file=sys.stderr)

        cookies_file = Path(persona.path) / "cookies.txt"

        # Import browser profile if requested
        if import_from in CHROMIUM_BROWSERS and source_profile_dir is not None:
            try:
                import_source = persona_importers.resolve_browser_import_source(import_from, profile_dir=profile)
                import_result = persona_importers.import_persona_from_source(
                    persona,
                    import_source,
                    copy_profile=True,
                    import_cookies=True,
                    capture_storage=False,
                )
            except Exception as e:
                # Aborts the whole command: names after this one are not processed.
                rprint(f"[red]Failed to import browser profile: {e}[/red]", file=sys.stderr)
                return 1

            if import_result.profile_copied:
                rprint("[green]Copied browser profile to persona[/green]", file=sys.stderr)
            if import_result.cookies_imported:
                rprint(f"[green]Extracted cookies to {cookies_file}[/green]", file=sys.stderr)
            elif not import_result.profile_copied:
                # Only warn when neither the cookies nor the profile made it across.
                rprint("[yellow]Could not import cookies automatically.[/yellow]", file=sys.stderr)

            for warning in import_result.warnings:
                rprint(f"[yellow]{warning}[/yellow]", file=sys.stderr)

        # Emit a machine-readable record only when stdout is piped/redirected.
        if not is_tty:
            write_record(
                {
                    "id": str(persona.id) if hasattr(persona, "id") else None,
                    "name": persona.name,
                    "path": str(persona.path),
                    "CHROME_USER_DATA_DIR": persona.CHROME_USER_DATA_DIR,
                    "COOKIES_FILE": persona.COOKIES_FILE,
                },
            )

    rprint(f"[green]Created {created_count} new persona(s)[/green]", file=sys.stderr)
    return 0
def update_personas(name: str | None = None) -> int:
    """
    Update Personas from stdin JSONL.

    Reads Persona records from stdin and applies updates.
    Uses PATCH semantics - only specified fields are updated.

    Each record is matched by ``id`` when present, otherwise by ``name``;
    records with neither are ignored. A rename is refused, and the record
    skipped, when the new name is invalid, already belongs to another persona,
    or its directory already exists on disk. Without that check
    ``shutil.move`` would nest the old directory inside the existing one.

    Args:
        name: New name to assign to each matched persona, or None to leave
            names unchanged.

    Exit codes:
        0: Success
        1: No input or error
    """
    from archivebox.misc.jsonl import read_stdin, write_record
    from archivebox.personas.models import Persona

    is_tty = sys.stdout.isatty()

    records = list(read_stdin())
    if not records:
        rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
        return 1

    updated_count = 0
    for record in records:
        persona_id = record.get("id")
        old_name = record.get("name")

        if not persona_id and not old_name:
            continue

        # Keep the try body down to the lookup that can raise DoesNotExist.
        try:
            if persona_id:
                persona = Persona.objects.get(id=persona_id)
            else:
                persona = Persona.objects.get(name=old_name)
        except Persona.DoesNotExist:
            rprint(f"[yellow]Persona not found: {persona_id or old_name}[/yellow]", file=sys.stderr)
            continue

        # Apply updates from CLI flags
        if name:
            # Validate new name to prevent path traversal
            is_valid, error_msg = persona_importers.validate_persona_name(name)
            if not is_valid:
                rprint(f'[red]Invalid new persona name "{name}": {error_msg}[/red]', file=sys.stderr)
                continue

            previous_name = persona.name
            if name != previous_name and Persona.objects.filter(name=name).exclude(pk=persona.pk).exists():
                rprint(
                    f'[red]Cannot rename persona "{previous_name}" to "{name}": name is already in use[/red]',
                    file=sys.stderr,
                )
                continue

            # Rename the persona directory too
            old_path = persona.path
            persona.name = name
            new_path = persona.path

            if old_path.exists() and old_path != new_path:
                # An existing destination is only acceptable when it is the same
                # directory (a case-only rename on a case-insensitive filesystem).
                if new_path.exists() and not new_path.samefile(old_path):
                    persona.name = previous_name
                    rprint(
                        f'[red]Cannot rename persona "{previous_name}" to "{name}": {new_path} already exists[/red]',
                        file=sys.stderr,
                    )
                    continue
                shutil.move(str(old_path), str(new_path))

        persona.save()

        updated_count += 1

        if not is_tty:
            write_record(
                {
                    "id": str(persona.id) if hasattr(persona, "id") else None,
                    "name": persona.name,
                    "path": str(persona.path),
                },
            )

    rprint(f"[green]Updated {updated_count} persona(s)[/green]", file=sys.stderr)
    return 0
+ + Exit codes: + 0: Success + 1: No input or missing --yes flag + """ + from archivebox.misc.jsonl import read_stdin + from archivebox.personas.models import Persona + + records = list(read_stdin()) + if not records: + rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr) + return 1 + + # Collect persona IDs or names + persona_ids = [] + persona_names = [] + for r in records: + if r.get("id"): + persona_ids.append(r["id"]) + elif r.get("name"): + persona_names.append(r["name"]) + + if not persona_ids and not persona_names: + rprint("[yellow]No valid persona IDs or names in input[/yellow]", file=sys.stderr) + return 1 + + from django.db.models import Q + + query = Q() + if persona_ids: + query |= Q(id__in=persona_ids) + if persona_names: + query |= Q(name__in=persona_names) + + personas = Persona.objects.filter(query) + count = personas.count() + + if count == 0: + rprint("[yellow]No matching personas found[/yellow]", file=sys.stderr) + return 0 + + if dry_run: + rprint(f"[yellow]Would delete {count} persona(s) (dry run)[/yellow]", file=sys.stderr) + for persona in personas: + rprint(f" {persona.name} ({persona.path})", file=sys.stderr) + return 0 + + if not yes: + rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr) + return 1 + + # Delete persona directories and database records + deleted_count = 0 + for persona in personas: + persona_path = persona.path + + # Safety check: ensure path is within PERSONAS_DIR before deletion + if not ensure_path_within_personas_dir(persona_path): + rprint(f'[red]Security error: persona path "{persona_path}" is outside PERSONAS_DIR. 
# Root click group; the subcommands below register themselves on it.
# The docstrings in this section double as the --help text shown by click,
# so they are user-facing strings rather than internal documentation.
@click.group()
def main():
    """Manage Persona records (browser profiles)."""
    pass


# Every subcommand forwards its parsed options to the matching *_personas()
# function and uses that function's integer return value as the process exit
# status via sys.exit().
@main.command("create")
@click.argument("names", nargs=-1)
@click.option("--import", "import_from", help="Import profile from browser (chrome, chromium, brave, edge)")
@click.option("--profile", help="Profile directory name under the user data dir (e.g. Default, Profile 1)")
def create_cmd(names: tuple, import_from: str | None, profile: str | None):
    """Create Personas, optionally importing from a browser profile."""
    sys.exit(create_personas(names, import_from=import_from, profile=profile))


@main.command("list")
@click.option("--name", help="Filter by exact name")
@click.option("--name__icontains", help="Filter by name contains")
@click.option("--limit", "-n", type=int, help="Limit number of results")
def list_cmd(name: str | None, name__icontains: str | None, limit: int | None):
    """List Personas as JSONL."""
    sys.exit(list_personas(name=name, name__icontains=name__icontains, limit=limit))


# NOTE: "-n" means --name here but --limit on the "list" subcommand above.
@main.command("update")
@click.option("--name", "-n", help="Set new name")
def update_cmd(name: str | None):
    """Update Personas from stdin JSONL."""
    sys.exit(update_personas(name=name))


@main.command("delete")
@click.option("--yes", "-y", is_flag=True, help="Confirm deletion")
@click.option("--dry-run", is_flag=True, help="Show what would be deleted")
def delete_cmd(yes: bool, dry_run: bool):
    """Delete Personas from stdin JSONL."""
    sys.exit(delete_personas(yes=yes, dry_run=dry_run))


if __name__ == "__main__":
    main()
# NOTE: this docstring is reused as the CLI help text through
# @docstring(pluginmap.__doc__) on main(), so it is user-facing output.
@enforce_types
def pluginmap(
    show_disabled: bool = False,
    event: str | None = None,
    quiet: bool = False,
) -> dict:
    """
    Show the current abx-dl event phases and their associated plugin hooks.

    This command reflects the new bus-driven runtime, not the legacy ArchiveBox
    state-machine executor. Event names are normalized to hook prefixes by
    stripping a trailing `Event`, then ArchiveBox checks whether any matching
    `on_{EventFamily}__*` scripts actually exist.
    """
    from rich.console import Console
    from rich.table import Table
    from rich.panel import Panel
    from rich import box

    from archivebox.hooks import (
        BUILTIN_PLUGINS_DIR,
        USER_PLUGINS_DIR,
        discover_hooks,
        is_background_hook,
        normalize_hook_event_name,
    )

    console = Console()
    prnt = console.print

    # Static catalogue of the event phases this command reports on:
    # a human-readable description plus the events each phase is listed as emitting.
    event_phases = {
        "InstallEvent": {
            "description": "Pre-run dependency phase. Enabled plugins emit BinaryRequest events from config.json required_binaries.",
            "emits": ["BinaryRequestEvent", "BinaryEvent", "ProcessEvent"],
        },
        "BinaryRequestEvent": {
            "description": "Provider phase. on_BinaryRequest hooks resolve or install requested binaries.",
            "emits": ["BinaryEvent", "ProcessEvent"],
        },
        "BinaryEvent": {
            "description": "Resolved binary metadata event. Projected into the DB binary cache.",
            "emits": [],
        },
        "CrawlEvent": {
            "description": "Root crawl lifecycle event emitted by the runner.",
            "emits": ["CrawlSetupEvent", "CrawlStartEvent", "CrawlCleanupEvent", "CrawlCompletedEvent"],
        },
        "CrawlSetupEvent": {
            "description": "Crawl-scoped setup phase. on_CrawlSetup hooks launch/configure shared daemons and runtime state.",
            "emits": ["ProcessEvent"],
        },
        "SnapshotEvent": {
            "description": "Per-snapshot extraction phase. on_Snapshot hooks emit ArchiveResult, Snapshot, and Tag records.",
            "emits": ["ArchiveResultEvent", "SnapshotEvent", "TagEvent", "ProcessEvent"],
        },
        "SnapshotCleanupEvent": {
            "description": "Internal snapshot cleanup phase.",
            "emits": ["ProcessKillEvent"],
        },
        "CrawlCleanupEvent": {
            "description": "Internal crawl cleanup phase.",
            "emits": ["ProcessKillEvent"],
        },
    }

    # Narrow the catalogue to a single event when one was requested: first try
    # the exact key, then compare the normalized forms of the names.
    if event:
        requested = str(event).strip()
        if requested in event_phases:
            event_phases = {requested: event_phases[requested]}
        else:
            normalized_requested = normalize_hook_event_name(requested)
            matched_name = next((name for name in event_phases if normalize_hook_event_name(name) == normalized_requested), None)
            if matched_name is None:
                # Printed even when quiet=True; the caller then receives an empty dict.
                prnt(f'[red]Error: Unknown event "{requested}". Available: {", ".join(event_phases.keys())}[/red]')
                return {}
            event_phases = {matched_name: event_phases[matched_name]}

    # Returned structure; "events" is filled in per phase by the loop below.
    result = {
        "events": {},
        "plugins_dir": str(BUILTIN_PLUGINS_DIR),
        "user_plugins_dir": str(USER_PLUGINS_DIR),
    }

    if not quiet:
        prnt()
        prnt("[bold cyan]ArchiveBox Plugin Map[/bold cyan]")
        prnt(f"[dim]Built-in plugins: {BUILTIN_PLUGINS_DIR}[/dim]")
        prnt(f"[dim]User plugins: {USER_PLUGINS_DIR}[/dim]")
        prnt()
        prnt(
            Panel(
                EVENT_FLOW_DIAGRAM,
                title="[bold green]Event Flow[/bold green]",
                border_style="green",
                expand=False,
            ),
        )
        prnt()

    for event_name, info in event_phases.items():
        hook_event = normalize_hook_event_name(event_name)
        # Hooks of disabled plugins are filtered out unless show_disabled is set.
        hooks = discover_hooks(event_name, filter_disabled=not show_disabled)

        hook_infos = []
        for hook_path in hooks:
            # The plugin name is taken from the hook script's parent directory.
            plugin_name = hook_path.parent.name
            hook_infos.append(
                {
                    "path": str(hook_path),
                    "name": hook_path.name,
                    "plugin": plugin_name,
                    "is_background": is_background_hook(hook_path.name),
                    "extension": hook_path.suffix,
                },
            )

        result["events"][event_name] = {
            "description": info["description"],
            "hook_event": hook_event,
            "emits": info["emits"],
            "hooks": hook_infos,
            "hook_count": len(hook_infos),
        }

        # In quiet mode only the result dict is built; no tables are rendered.
        if quiet:
            continue

        title_suffix = f" -> on_{hook_event}__*" if hook_infos else ""
        table = Table(
            title=f"[bold yellow]{event_name}[/bold yellow]{title_suffix} ({len(hooks)} hooks)",
            box=box.ROUNDED,
            show_header=True,
            header_style="bold magenta",
        )
        table.add_column("Plugin", style="cyan", width=20)
        table.add_column("Hook Name", style="green")
        table.add_column("BG", justify="center", width=4)
        table.add_column("Type", justify="center", width=5)

        if hook_infos:
            # Rows are ordered by hook file name.
            for hook in sorted(hook_infos, key=lambda h: h["name"]):
                bg_marker = "[yellow]bg[/yellow]" if hook["is_background"] else ""
                table.add_row(
                    hook["plugin"],
                    hook["name"],
                    bg_marker,
                    hook["extension"].lstrip("."),
                )
        else:
            # Placeholder row so the table is not rendered empty.
            table.add_row("[dim]-[/dim]", "[dim]No direct hooks[/dim]", "", "")

        prnt(table)
        prnt(f"[dim]{info['description']}[/dim]")
        if info["emits"]:
            prnt(f"[dim]Emits: {', '.join(info['emits'])}[/dim]")
        if not hook_infos:
            prnt(f"[dim]No direct on_{hook_event}__* scripts are currently defined for this event family.[/dim]")
        prnt()

    if not quiet:
        total_hooks = sum(event_info["hook_count"] for event_info in result["events"].values())
        prnt(f"[bold]Total hooks discovered: {total_hooks}[/bold]")
        prnt()
        prnt("[dim]Hook naming convention: on_{EventFamily}__{XX}_{description}[.bg].{ext}[/dim]")
        prnt("[dim]Event names are normalized with a simple `Event` suffix strip before hook discovery.[/dim]")
        prnt("[dim]If no `on_{EventFamily}__*` scripts exist, the event is shown as having no direct hooks.[/dim]")
        prnt()

    return result
def list_processes(
    binary_name: str | None = None,
    machine_id: str | None = None,
    limit: int | None = None,
) -> int:
    """
    Print Process rows, newest start time first.

    On a TTY each row is a coloured one-line summary (exit code, binary name,
    process id); otherwise each row is written as a JSONL record. A count
    summary always goes to stderr.

    Args:
        binary_name: Keep only processes whose binary has this name.
        machine_id: Keep only processes recorded for this machine id.
        limit: Maximum number of rows, passed through to apply_filters.

    Exit codes:
        0: Success (even if no results)
    """
    from archivebox.misc.jsonl import write_record
    from archivebox.machine.models import Process

    interactive = sys.stdout.isatty()

    # Only truthy filter values are forwarded.
    requested_filters = (
        ("binary__name", binary_name),
        ("machine_id", machine_id),
    )
    filter_kwargs = {field: value for field, value in requested_filters if value}

    base_queryset = Process.objects.all().select_related("binary", "machine").order_by("-start_ts")
    queryset = apply_filters(base_queryset, filter_kwargs, limit=limit)

    count = 0
    for count, process in enumerate(queryset, start=1):
        if not interactive:
            write_record(process.to_json())
            continue

        binary_name_str = process.binary.name if process.binary else "unknown"

        code = process.exit_code
        exit_code = "?" if code is None else code

        # 0 -> green, any other non-zero code -> red, unknown (None) -> yellow.
        if code == 0:
            status_color = "green"
        elif code:
            status_color = "red"
        else:
            status_color = "yellow"

        rprint(f"[{status_color}]exit={exit_code:3}[/{status_color}] [cyan]{binary_name_str:15}[/cyan] [dim]{process.id}[/dim]")

    rprint(f"[dim]Listed {count} processes[/dim]", file=sys.stderr)
    return 0
a/archivebox/cli/archivebox_remove.py +++ b/archivebox/cli/archivebox_remove.py @@ -1,82 +1,112 @@ #!/usr/bin/env python3 -__package__ = 'archivebox.cli' -__command__ = 'archivebox remove' +__package__ = "archivebox.cli" +__command__ = "archivebox remove" -import sys -import argparse +import shutil +from pathlib import Path +from collections.abc import Iterable -from typing import Optional, List, IO +import rich_click as click -from ..main import remove -from ..util import docstring -from ..config import OUTPUT_DIR -from ..logging_util import SmartFormatter, accept_stdin +from django.db.models import QuerySet +from archivebox.config import DATA_DIR +from archivebox.config.constants import CONSTANTS +from archivebox.config.django import setup_django +from archivebox.misc.util import enforce_types, docstring +from archivebox.misc.checks import check_data_folder +from archivebox.misc.logging_util import ( + log_list_started, + log_list_finished, + log_removal_started, + log_removal_finished, + TimedProgress, +) + +@enforce_types +def remove( + filter_patterns: Iterable[str] = (), + filter_type: str = "exact", + snapshots: QuerySet | None = None, + after: float | None = None, + before: float | None = None, + yes: bool = False, + delete: bool = False, + out_dir: Path = DATA_DIR, +) -> QuerySet: + """Remove the specified URLs from the archive""" + + setup_django() + check_data_folder() + + from archivebox.cli.archivebox_search import get_snapshots + + pattern_list = list(filter_patterns) + + log_list_started(pattern_list or None, filter_type) + timer = TimedProgress(360, prefix=" ") + try: + snapshots = get_snapshots( + snapshots=snapshots, + filter_patterns=pattern_list or None, + filter_type=filter_type, + after=after, + before=before, + ) + finally: + timer.end() + + if not snapshots.exists(): + log_removal_finished(0, 0) + raise SystemExit(1) + + log_list_finished(snapshots) + log_removal_started(snapshots, yes=yes, delete=delete) + + timer = TimedProgress(360, 
prefix=" ") + try: + for snapshot in snapshots: + if delete: + shutil.rmtree(snapshot.output_dir, ignore_errors=True) + legacy_path = CONSTANTS.ARCHIVE_DIR / snapshot.timestamp + if legacy_path.is_symlink(): + legacy_path.unlink(missing_ok=True) + finally: + timer.end() + + to_remove = snapshots.count() + + from archivebox.search import flush_search_index + from archivebox.core.models import Snapshot + + flush_search_index(snapshots=snapshots) + snapshots.delete() + all_snapshots = Snapshot.objects.all() + log_removal_finished(all_snapshots.count(), to_remove) + + return all_snapshots + + +@click.command() +@click.option("--yes", is_flag=True, help="Remove links instantly without prompting to confirm") +@click.option("--delete", is_flag=True, help="Delete the archived content and metadata folder in addition to removing from index") +@click.option("--before", type=float, help="Remove only URLs bookmarked before timestamp") +@click.option("--after", type=float, help="Remove only URLs bookmarked after timestamp") +@click.option( + "--filter-type", + "-f", + type=click.Choice(("exact", "substring", "domain", "regex", "tag")), + default="exact", + help="Type of pattern matching to use when filtering URLs", +) +@click.argument("filter_patterns", nargs=-1) @docstring(remove.__doc__) -def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None: - parser = argparse.ArgumentParser( - prog=__command__, - description=remove.__doc__, - add_help=True, - formatter_class=SmartFormatter, - ) - parser.add_argument( - '--yes', # '-y', - action='store_true', - help='Remove links instantly without prompting to confirm.', - ) - parser.add_argument( - '--delete', # '-r', - action='store_true', - help=( - "In addition to removing the link from the index, " - "also delete its archived content and metadata folder." 
- ), - ) - parser.add_argument( - '--before', #'-b', - type=float, - help="List only URLs bookmarked before the given timestamp.", - default=None, - ) - parser.add_argument( - '--after', #'-a', - type=float, - help="List only URLs bookmarked after the given timestamp.", - default=None, - ) - parser.add_argument( - '--filter-type', - type=str, - choices=('exact', 'substring', 'domain', 'regex','tag'), - default='exact', - help='Type of pattern matching to use when filtering URLs', - ) - parser.add_argument( - 'filter_patterns', - nargs='*', - type=str, - help='URLs matching this filter pattern will be removed from the index.' - ) - command = parser.parse_args(args or ()) - - filter_str = None - if not command.filter_patterns: - filter_str = accept_stdin(stdin) - - remove( - filter_str=filter_str, - filter_patterns=command.filter_patterns, - filter_type=command.filter_type, - before=command.before, - after=command.after, - yes=command.yes, - delete=command.delete, - out_dir=pwd or OUTPUT_DIR, - ) - - -if __name__ == '__main__': - main(args=sys.argv[1:], stdin=sys.stdin) +def main(**kwargs): + """Remove the specified URLs from the archive""" + remove(**kwargs) + + +if __name__ == "__main__": + main() diff --git a/archivebox/cli/archivebox_run.py b/archivebox/cli/archivebox_run.py new file mode 100644 index 0000000000..cca1e671b1 --- /dev/null +++ b/archivebox/cli/archivebox_run.py @@ -0,0 +1,367 @@ +#!/usr/bin/env python3 + +""" +archivebox run [--daemon] [--crawl-id=...] [--snapshot-id=...] [--binary-id=...] + +Unified command for processing queued work on the shared abx-dl bus. 
+ +Modes: + - With stdin JSONL: Process piped records, exit when complete + - Without stdin (TTY): Run the background runner in foreground until killed + - --crawl-id: Run the crawl runner for a specific crawl only + - --snapshot-id: Run a specific snapshot through its parent crawl + - --binary-id: Emit a BinaryRequestEvent for a specific Binary row + +Examples: + # Run the background runner in foreground + archivebox run + + # Run as daemon (don't exit on idle) + archivebox run --daemon + + # Process specific records (pipe any JSONL type, exits when done) + archivebox snapshot list --status=queued | archivebox run + archivebox archiveresult list --status=failed | archivebox run + archivebox crawl list --status=queued | archivebox run + + # Mixed types work too + cat mixed_records.jsonl | archivebox run + + # Run the crawl runner for a specific crawl + archivebox run --crawl-id=019b7e90-04d0-73ed-adec-aad9cfcd863e + + # Run one snapshot from an existing crawl + archivebox run --snapshot-id=019b7e90-5a8e-712c-9877-2c70eebe80ad + + # Run one queued binary install directly on the bus + archivebox run --binary-id=019b7e90-5a8e-712c-9877-2c70eebe80ad +""" + +__package__ = "archivebox.cli" +__command__ = "archivebox run" + +import sys +from collections import defaultdict + +import rich_click as click +from rich import print as rprint + + +def process_stdin_records() -> int: + """ + Process JSONL records from stdin. + + Create-or-update behavior: + - Records WITHOUT id: Create via Model.from_json(), then queue + - Records WITH id: Lookup existing, re-queue for processing + + Outputs JSONL of all processed records (for chaining). + + Handles any record type: Crawl, Snapshot, ArchiveResult. + Auto-cascades: Crawl → Snapshots → ArchiveResults. + + Returns exit code (0 = success, 1 = error). 
+ """ + from django.utils import timezone + + from archivebox.misc.jsonl import ( + read_stdin, + write_record, + TYPE_CRAWL, + TYPE_SNAPSHOT, + TYPE_ARCHIVERESULT, + TYPE_BINARYREQUEST, + TYPE_BINARY, + ) + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.core.models import Snapshot, ArchiveResult + from archivebox.crawls.models import Crawl + from archivebox.machine.models import Binary + from archivebox.services.runner import run_binary, run_crawl + + records = list(read_stdin()) + is_tty = sys.stdout.isatty() + + if not records: + return 0 # Nothing to process + + created_by_id = get_or_create_system_user_pk() + queued_count = 0 + output_records = [] + full_crawl_ids: set[str] = set() + snapshot_ids_by_crawl: dict[str, set[str]] = defaultdict(set) + plugin_names_by_crawl: dict[str, set[str]] = defaultdict(set) + run_all_plugins_for_crawl: set[str] = set() + binary_ids: list[str] = [] + + for record in records: + record_type = record.get("type", "") + record_id = record.get("id") + + try: + if record_type == TYPE_CRAWL: + if record_id: + # Existing crawl - re-queue + try: + crawl = Crawl.objects.get(id=record_id) + except Crawl.DoesNotExist: + crawl = Crawl.from_json(record, overrides={"created_by_id": created_by_id}) + else: + # New crawl - create it + crawl = Crawl.from_json(record, overrides={"created_by_id": created_by_id}) + + if crawl: + crawl.retry_at = timezone.now() + if crawl.status not in [Crawl.StatusChoices.SEALED]: + crawl.status = Crawl.StatusChoices.QUEUED + crawl.save() + full_crawl_ids.add(str(crawl.id)) + run_all_plugins_for_crawl.add(str(crawl.id)) + output_records.append(crawl.to_json()) + queued_count += 1 + + elif record_type == TYPE_SNAPSHOT or (record.get("url") and not record_type): + if record_id: + # Existing snapshot - re-queue + try: + snapshot = Snapshot.objects.get(id=record_id) + except Snapshot.DoesNotExist: + snapshot = Snapshot.from_json(record, overrides={"created_by_id": 
created_by_id}) + else: + # New snapshot - create it + snapshot = Snapshot.from_json(record, overrides={"created_by_id": created_by_id}) + + if snapshot: + snapshot.retry_at = timezone.now() + if snapshot.status not in [Snapshot.StatusChoices.SEALED]: + snapshot.status = Snapshot.StatusChoices.QUEUED + snapshot.save() + crawl = snapshot.crawl + crawl.retry_at = timezone.now() + if crawl.status != Crawl.StatusChoices.STARTED: + crawl.status = Crawl.StatusChoices.QUEUED + crawl.save(update_fields=["status", "retry_at", "modified_at"]) + crawl_id = str(snapshot.crawl_id) + snapshot_ids_by_crawl[crawl_id].add(str(snapshot.id)) + run_all_plugins_for_crawl.add(crawl_id) + output_records.append(snapshot.to_json()) + queued_count += 1 + + elif record_type == TYPE_ARCHIVERESULT: + if record_id: + # Existing archiveresult - re-queue + try: + archiveresult = ArchiveResult.objects.get(id=record_id) + except ArchiveResult.DoesNotExist: + archiveresult = None + else: + archiveresult = None + + snapshot_id = record.get("snapshot_id") + plugin_name = record.get("plugin") + snapshot = None + if archiveresult: + if archiveresult.status in [ + ArchiveResult.StatusChoices.FAILED, + ArchiveResult.StatusChoices.SKIPPED, + ArchiveResult.StatusChoices.NORESULTS, + ArchiveResult.StatusChoices.BACKOFF, + ]: + archiveresult.reset_for_retry() + snapshot = archiveresult.snapshot + plugin_name = plugin_name or archiveresult.plugin + elif snapshot_id: + try: + snapshot = Snapshot.objects.get(id=snapshot_id) + except Snapshot.DoesNotExist: + snapshot = None + + if snapshot: + snapshot.retry_at = timezone.now() + if snapshot.status != Snapshot.StatusChoices.STARTED: + snapshot.status = Snapshot.StatusChoices.QUEUED + snapshot.save(update_fields=["status", "retry_at", "modified_at"]) + crawl = snapshot.crawl + crawl.retry_at = timezone.now() + if crawl.status != Crawl.StatusChoices.STARTED: + crawl.status = Crawl.StatusChoices.QUEUED + crawl.save(update_fields=["status", "retry_at", "modified_at"]) 
+ crawl_id = str(snapshot.crawl_id) + snapshot_ids_by_crawl[crawl_id].add(str(snapshot.id)) + if plugin_name: + plugin_names_by_crawl[crawl_id].add(str(plugin_name)) + output_records.append(record if not archiveresult else archiveresult.to_json()) + queued_count += 1 + + elif record_type in {TYPE_BINARYREQUEST, TYPE_BINARY}: + if record_id: + try: + binary = Binary.objects.get(id=record_id) + except Binary.DoesNotExist: + binary = Binary.from_json(record) + else: + binary = Binary.from_json(record) + + if binary: + binary.retry_at = timezone.now() + if binary.status != Binary.StatusChoices.INSTALLED: + binary.status = Binary.StatusChoices.QUEUED + binary.save() + binary_ids.append(str(binary.id)) + output_records.append(binary.to_json()) + queued_count += 1 + + else: + # Unknown type - pass through + output_records.append(record) + + except Exception as e: + rprint(f"[yellow]Error processing record: {e}[/yellow]", file=sys.stderr) + continue + + # Output all processed records (for chaining) + if not is_tty: + for rec in output_records: + write_record(rec) + + if queued_count == 0: + rprint("[yellow]No records to process[/yellow]", file=sys.stderr) + return 0 + + rprint(f"[blue]Processing {queued_count} records...[/blue]", file=sys.stderr) + + for binary_id in binary_ids: + run_binary(binary_id) + + targeted_crawl_ids = full_crawl_ids | set(snapshot_ids_by_crawl) + if targeted_crawl_ids: + for crawl_id in sorted(targeted_crawl_ids): + run_crawl( + crawl_id, + snapshot_ids=None if crawl_id in full_crawl_ids else sorted(snapshot_ids_by_crawl[crawl_id]), + selected_plugins=None if crawl_id in run_all_plugins_for_crawl else sorted(plugin_names_by_crawl[crawl_id]), + ) + return 0 + + +def run_runner(daemon: bool = False) -> int: + """ + Run the background runner loop. + + Args: + daemon: Run forever (don't exit when idle) + + Returns exit code (0 = success, 1 = error). 
+ """ + from django.utils import timezone + from archivebox.machine.models import Machine, Process + from archivebox.services.runner import recover_orphaned_crawls, recover_orphaned_snapshots, run_pending_crawls + + Process.cleanup_stale_running() + Process.cleanup_orphaned_workers() + recover_orphaned_snapshots() + recover_orphaned_crawls() + Machine.current() + current = Process.current() + if current.process_type != Process.TypeChoices.ORCHESTRATOR: + current.process_type = Process.TypeChoices.ORCHESTRATOR + current.save(update_fields=["process_type", "modified_at"]) + + try: + run_pending_crawls(daemon=daemon) + return 0 + except KeyboardInterrupt: + return 0 + except Exception as e: + rprint(f"[red]Runner error: {type(e).__name__}: {e}[/red]", file=sys.stderr) + return 1 + finally: + current.refresh_from_db() + if current.status != Process.StatusChoices.EXITED: + current.status = Process.StatusChoices.EXITED + current.ended_at = current.ended_at or timezone.now() + current.save(update_fields=["status", "ended_at", "modified_at"]) + + +@click.command() +@click.option("--daemon", "-d", is_flag=True, help="Run forever (don't exit on idle)") +@click.option("--crawl-id", help="Run the crawl runner for a specific crawl only") +@click.option("--snapshot-id", help="Run one snapshot through its crawl") +@click.option("--binary-id", help="Run one queued binary install directly on the bus") +def main(daemon: bool, crawl_id: str, snapshot_id: str, binary_id: str): + """ + Process queued work. 
+ + Modes: + - No args + stdin piped: Process piped JSONL records + - No args + TTY: Run the crawl runner for all work + - --crawl-id: Run the crawl runner for that crawl only + - --snapshot-id: Run one snapshot through its crawl only + - --binary-id: Run one queued binary install directly on the bus + """ + if snapshot_id: + sys.exit(run_snapshot_worker(snapshot_id)) + + if binary_id: + try: + from archivebox.services.runner import run_binary + + run_binary(binary_id) + sys.exit(0) + except KeyboardInterrupt: + sys.exit(0) + except Exception as e: + rprint(f"[red]Runner error: {type(e).__name__}: {e}[/red]", file=sys.stderr) + import traceback + + traceback.print_exc() + sys.exit(1) + + if crawl_id: + try: + from archivebox.services.runner import run_crawl + + run_crawl(crawl_id) + sys.exit(0) + except KeyboardInterrupt: + sys.exit(0) + except Exception as e: + rprint(f"[red]Runner error: {type(e).__name__}: {e}[/red]", file=sys.stderr) + import traceback + + traceback.print_exc() + sys.exit(1) + + if daemon: + if not sys.stdin.isatty(): + exit_code = process_stdin_records() + if exit_code != 0: + sys.exit(exit_code) + sys.exit(run_runner(daemon=True)) + + if not sys.stdin.isatty(): + sys.exit(process_stdin_records()) + else: + sys.exit(run_runner(daemon=daemon)) + + +def run_snapshot_worker(snapshot_id: str) -> int: + from archivebox.core.models import Snapshot + from archivebox.services.runner import run_crawl + + try: + snapshot = Snapshot.objects.select_related("crawl").get(id=snapshot_id) + run_crawl(str(snapshot.crawl_id), snapshot_ids=[str(snapshot.id)]) + return 0 + except KeyboardInterrupt: + return 0 + except Exception as e: + rprint(f"[red]Runner error: {type(e).__name__}: {e}[/red]", file=sys.stderr) + import traceback + + traceback.print_exc() + return 1 + + +if __name__ == "__main__": + main() diff --git a/archivebox/cli/archivebox_schedule.py b/archivebox/cli/archivebox_schedule.py index f528e6a620..bb9c1dacfc 100644 --- 
a/archivebox/cli/archivebox_schedule.py +++ b/archivebox/cli/archivebox_schedule.py @@ -1,103 +1,177 @@ #!/usr/bin/env python3 -__package__ = 'archivebox.cli' -__command__ = 'archivebox schedule' +__package__ = "archivebox.cli" -import sys -import argparse +import rich_click as click +from rich import print -from typing import Optional, List, IO +from archivebox.misc.util import enforce_types, docstring +from archivebox.config.common import ARCHIVING_CONFIG -from ..main import schedule -from ..util import docstring -from ..config import OUTPUT_DIR -from ..logging_util import SmartFormatter, reject_stdin +@enforce_types +def schedule( + add: bool = False, + show: bool = False, + clear: bool = False, + foreground: bool = False, + run_all: bool = False, + quiet: bool = False, + every: str | None = None, + tag: str = "", + depth: int | str = 0, + overwrite: bool = False, + update: bool = not ARCHIVING_CONFIG.ONLY_NEW, + import_path: str | None = None, +): + """Manage database-backed scheduled crawls processed by the crawl runner.""" + from django.utils import timezone + + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl, CrawlSchedule + from archivebox.crawls.schedule_utils import validate_schedule + from archivebox.services.runner import run_pending_crawls + + depth = int(depth) + result: dict[str, object] = { + "created_schedule_ids": [], + "disabled_count": 0, + "run_all_enqueued": 0, + "active_schedule_ids": [], + } + + def _active_schedules(): + return CrawlSchedule.objects.filter(is_enabled=True).select_related("template").order_by("created_at") + + if clear: + disabled_count = CrawlSchedule.objects.filter(is_enabled=True).update( + is_enabled=False, + modified_at=timezone.now(), + ) + result["disabled_count"] = disabled_count + print(f"[green]\\[√] Disabled {disabled_count} scheduled crawl(s).[/green]") + + if every or add: + schedule_str = (every or "day").strip() + 
validate_schedule(schedule_str) + + created_by_id = get_or_create_system_user_pk() + is_update_schedule = not import_path + template_urls = import_path or "archivebox://update" + template_label = (f"Scheduled import: {template_urls}" if import_path else "Scheduled ArchiveBox update")[:64] + template_notes = ( + f"Created by archivebox schedule for {template_urls}" + if import_path + else "Created by archivebox schedule to queue recurring archivebox://update maintenance crawls." + ) + + template = Crawl.objects.create( + urls=template_urls, + max_depth=0 if is_update_schedule else depth, + tags_str="" if is_update_schedule else tag, + label=template_label, + notes=template_notes, + created_by_id=created_by_id, + status=Crawl.StatusChoices.SEALED, + retry_at=None, + config={ + "ONLY_NEW": not update, + "OVERWRITE": overwrite, + "DEPTH": 0 if is_update_schedule else depth, + "SCHEDULE_KIND": "update" if is_update_schedule else "crawl", + }, + ) + crawl_schedule = CrawlSchedule.objects.create( + template=template, + schedule=schedule_str, + is_enabled=True, + label=template_label, + notes=template_notes, + created_by_id=created_by_id, + ) + result["created_schedule_ids"] = [str(crawl_schedule.id)] + + schedule_type = "maintenance update" if is_update_schedule else "crawl" + print(f"[green]\\[√] Created scheduled {schedule_type}.[/green]") + print(f" id={crawl_schedule.id}") + print(f" every={crawl_schedule.schedule}") + print(f" next_run={crawl_schedule.next_run_at.isoformat()}") + if import_path: + print(f" source={import_path}") + + schedules = list(_active_schedules()) + result["active_schedule_ids"] = [str(schedule.id) for schedule in schedules] + + if show: + if schedules: + print(f"[green]\\[*] Active scheduled crawls: {len(schedules)}[/green]") + for scheduled_crawl in schedules: + template = scheduled_crawl.template + print( + f" - id={scheduled_crawl.id} every={scheduled_crawl.schedule} " + f"next_run={scheduled_crawl.next_run_at.isoformat()} " + 
f"source={template.urls.splitlines()[0] if template.urls else ''}", + ) + else: + print("[yellow]\\[*] No scheduled crawls are enabled.[/yellow]") + + if run_all: + enqueued = 0 + now = timezone.now() + for scheduled_crawl in schedules: + scheduled_crawl.enqueue(queued_at=now) + enqueued += 1 + result["run_all_enqueued"] = enqueued + print(f"[green]\\[*] Enqueued {enqueued} scheduled crawl(s) immediately.[/green]") + if enqueued: + print( + "[yellow]\\[*] Start `archivebox server`, `archivebox run --daemon`, or `archivebox schedule --foreground` to process the queued crawls.[/yellow]", + ) + + if foreground: + print( + "[green]\\[*] Starting global crawl runner in foreground mode. It will materialize scheduled crawls and process queued work.[/green]", + ) + run_pending_crawls(daemon=True) + + if quiet: + return result + + if not any((every, add, show, clear, foreground, run_all)): + if schedules: + print("[green]\\[*] Active scheduled crawls:[/green]") + for scheduled_crawl in schedules: + print(f" - {scheduled_crawl.id} every={scheduled_crawl.schedule} next_run={scheduled_crawl.next_run_at.isoformat()}") + else: + print("[yellow]\\[*] No scheduled crawls are enabled.[/yellow]") + + return result + + +@click.command() +@click.option("--quiet", "-q", is_flag=True, help="Return structured results without extra summary output") +@click.option("--add", is_flag=True, help="Create a new scheduled crawl") +@click.option("--every", type=str, help='Run on an alias like daily/weekly/monthly or a cron expression such as "0 */6 * * *"') +@click.option("--tag", "-t", default="", help="Comma-separated tags to apply to scheduled crawl snapshots") +@click.option( + "--depth", + type=click.Choice([str(i) for i in range(5)]), + default="0", + help="Recursively archive linked pages up to N hops away", +) +@click.option("--overwrite", is_flag=True, help="Overwrite existing data if URLs have been archived previously") +@click.option("--update", is_flag=True, help="Retry previously 
failed/skipped URLs when scheduled crawls run") +@click.option("--clear", is_flag=True, help="Disable all currently enabled schedules") +@click.option("--show", is_flag=True, help="Print all currently enabled schedules") +@click.option("--foreground", "-f", is_flag=True, help="Run the global crawl runner in the foreground (no crontab required)") +@click.option("--run-all", is_flag=True, help="Enqueue all enabled schedules immediately and process them once") +@click.argument("import_path", required=False) @docstring(schedule.__doc__) -def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None: - parser = argparse.ArgumentParser( - prog=__command__, - description=schedule.__doc__, - add_help=True, - formatter_class=SmartFormatter, - ) - parser.add_argument( - '--quiet', '-q', - action='store_true', - help=("Don't warn about storage space."), - ) - group = parser.add_mutually_exclusive_group() - group.add_argument( - '--add', # '-a', - action='store_true', - help='Add a new scheduled ArchiveBox update job to cron', - ) - parser.add_argument( - '--every', # '-e', - type=str, - default=None, - help='Run ArchiveBox once every [timeperiod] (hour/day/month/year or cron format e.g. 
"0 0 * * *")', - ) - parser.add_argument( - '--depth', # '-d', - type=int, - choices=[0, 1], - default=0, - help='Depth to archive to [0] or 1, see "add" command help for more info', - ) - parser.add_argument( - '--overwrite', - action='store_true', - help='Re-archive any URLs that have been previously archived, overwriting existing Snapshots', - ) - group.add_argument( - '--clear', # '-c' - action='store_true', - help=("Stop all ArchiveBox scheduled runs (remove cron jobs)"), - ) - group.add_argument( - '--show', # '-s' - action='store_true', - help=("Print a list of currently active ArchiveBox cron jobs"), - ) - group.add_argument( - '--foreground', '-f', - action='store_true', - help=("Launch ArchiveBox scheduler as a long-running foreground task " - "instead of using cron."), - ) - group.add_argument( - '--run-all', # '-a', - action='store_true', - help=("Run all the scheduled jobs once immediately, independent of " - "their configured schedules, can be used together with --foreground"), - ) - parser.add_argument( - 'import_path', - nargs='?', - type=str, - default=None, - help=("Check this path and import any new links on every run " - "(can be either local file or remote URL)"), - ) - command = parser.parse_args(args or ()) - reject_stdin(__command__, stdin) - - schedule( - add=command.add, - show=command.show, - clear=command.clear, - foreground=command.foreground, - run_all=command.run_all, - quiet=command.quiet, - every=command.every, - depth=command.depth, - overwrite=command.overwrite, - import_path=command.import_path, - out_dir=pwd or OUTPUT_DIR, - ) - - -if __name__ == '__main__': - main(args=sys.argv[1:], stdin=sys.stdin) +def main(**kwargs): + """Manage database-backed scheduled crawls processed by the crawl runner.""" + schedule(**kwargs) + + +if __name__ == "__main__": + main() diff --git a/archivebox/cli/archivebox_search.py b/archivebox/cli/archivebox_search.py new file mode 100644 index 0000000000..86af83bb6c --- /dev/null +++ 
b/archivebox/cli/archivebox_search.py @@ -0,0 +1,257 @@ +#!/usr/bin/env python3 + +__package__ = "archivebox.cli" +__command__ = "archivebox search" + +import sys +from pathlib import Path +from typing import TYPE_CHECKING +from collections.abc import Callable + +import rich_click as click + +from django.db.models import Q, QuerySet + +from archivebox.config import DATA_DIR +from archivebox.misc.logging import stderr +from archivebox.misc.util import enforce_types, docstring + +if TYPE_CHECKING: + from archivebox.core.models import Snapshot + +# Filter types for URL matching +LINK_FILTERS: dict[str, Callable[[str], Q]] = { + "exact": lambda pattern: Q(url=pattern), + "substring": lambda pattern: Q(url__icontains=pattern), + "regex": lambda pattern: Q(url__iregex=pattern), + "domain": lambda pattern: ( + Q(url__istartswith=f"http://{pattern}") | Q(url__istartswith=f"https://{pattern}") | Q(url__istartswith=f"ftp://{pattern}") + ), + "tag": lambda pattern: Q(tags__name=pattern), + "timestamp": lambda pattern: Q(timestamp=pattern), +} + +STATUS_CHOICES = ["indexed", "archived", "unarchived"] + + +def _apply_pattern_filters( + snapshots: QuerySet["Snapshot", "Snapshot"], + filter_patterns: list[str], + filter_type: str, +) -> QuerySet["Snapshot", "Snapshot"]: + filter_builder = LINK_FILTERS.get(filter_type) + if filter_builder is None: + stderr() + stderr(f"[X] Got invalid pattern for --filter-type={filter_type}", color="red") + raise SystemExit(2) + + query = Q() + for pattern in filter_patterns: + query |= filter_builder(pattern) + return snapshots.filter(query) + + +def _snapshots_to_json( + snapshots: QuerySet["Snapshot", "Snapshot"], + *, + with_headers: bool, +) -> str: + from datetime import datetime, timezone as tz + + from archivebox.config import VERSION + from archivebox.config.common import SERVER_CONFIG + from archivebox.misc.util import to_json + + main_index_header = ( + { + "info": "This is an index of site data archived by ArchiveBox: The self-hosted 
web archive.", + "schema": "archivebox.index.json", + "copyright_info": SERVER_CONFIG.FOOTER_INFO, + "meta": { + "project": "ArchiveBox", + "version": VERSION, + "git_sha": VERSION, + "website": "https://ArchiveBox.io", + "docs": "https://github.com/ArchiveBox/ArchiveBox/wiki", + "source": "https://github.com/ArchiveBox/ArchiveBox", + "issues": "https://github.com/ArchiveBox/ArchiveBox/issues", + "dependencies": {}, + }, + } + if with_headers + else {} + ) + + snapshot_dicts = [snapshot.to_dict(extended=True) for snapshot in snapshots.iterator(chunk_size=500)] + output: dict[str, object] | list[dict[str, object]] + if with_headers: + output = { + **main_index_header, + "num_links": len(snapshot_dicts), + "updated": datetime.now(tz.utc), + "last_run_cmd": sys.argv, + "links": snapshot_dicts, + } + else: + output = snapshot_dicts + + return to_json(output, indent=4, sort_keys=True) + + +def _snapshots_to_csv( + snapshots: QuerySet["Snapshot", "Snapshot"], + *, + cols: list[str], + with_headers: bool, +) -> str: + header = ",".join(cols) if with_headers else "" + rows = [snapshot.to_csv(cols=cols, separator=",") for snapshot in snapshots.iterator(chunk_size=500)] + return "\n".join((header, *rows)) + + +def _snapshots_to_html( + snapshots: QuerySet["Snapshot", "Snapshot"], + *, + with_headers: bool, +) -> str: + from datetime import datetime, timezone as tz + + from django.template.loader import render_to_string + + from archivebox.config import VERSION + from archivebox.config.common import SERVER_CONFIG + from archivebox.config.version import get_COMMIT_HASH + + template = "static_index.html" if with_headers else "minimal_index.html" + snapshot_list = list(snapshots.iterator(chunk_size=500)) + + return render_to_string( + template, + { + "version": VERSION, + "git_sha": get_COMMIT_HASH() or VERSION, + "num_links": str(len(snapshot_list)), + "date_updated": datetime.now(tz.utc).strftime("%Y-%m-%d"), + "time_updated": datetime.now(tz.utc).strftime("%Y-%m-%d %H:%M"), + 
"links": snapshot_list, + "FOOTER_INFO": SERVER_CONFIG.FOOTER_INFO, + }, + ) + + +def get_snapshots( + snapshots: QuerySet["Snapshot", "Snapshot"] | None = None, + filter_patterns: list[str] | None = None, + filter_type: str = "substring", + after: float | None = None, + before: float | None = None, + out_dir: Path = DATA_DIR, +) -> QuerySet["Snapshot", "Snapshot"]: + """Filter and return Snapshots matching the given criteria.""" + from archivebox.core.models import Snapshot + + if snapshots is not None: + result = snapshots + else: + result = Snapshot.objects.all() + + if after is not None: + result = result.filter(timestamp__gte=after) + if before is not None: + result = result.filter(timestamp__lt=before) + if filter_patterns: + result = _apply_pattern_filters(result, filter_patterns, filter_type) + + # Prefetch crawl relationship to avoid N+1 queries when accessing output_dir + result = result.select_related("crawl", "crawl__created_by") + + if not result.exists(): + stderr("[!] No Snapshots matched your filters:", filter_patterns, f"({filter_type})", color="lightyellow") + + return result + + +@enforce_types +def search( + filter_patterns: list[str] | None = None, + filter_type: str = "substring", + status: str = "indexed", + before: float | None = None, + after: float | None = None, + sort: str | None = None, + json: bool = False, + html: bool = False, + csv: str | None = None, + with_headers: bool = False, +): + """List, filter, and export information about archive entries""" + + if with_headers and not (json or html or csv): + stderr("[X] --with-headers requires --json, --html or --csv\n", color="red") + raise SystemExit(2) + + # Query DB directly - no filesystem scanning + snapshots = get_snapshots( + filter_patterns=list(filter_patterns) if filter_patterns else None, + filter_type=filter_type, + before=before, + after=after, + ) + + # Apply status filter + if status == "archived": + snapshots = snapshots.filter(downloaded_at__isnull=False) + elif status 
== "unarchived": + snapshots = snapshots.filter(downloaded_at__isnull=True) + # 'indexed' = all snapshots (no filter) + + if sort: + snapshots = snapshots.order_by(sort) + + # Export to requested format + if json: + output = _snapshots_to_json(snapshots, with_headers=with_headers) + elif html: + output = _snapshots_to_html(snapshots, with_headers=with_headers) + elif csv: + output = _snapshots_to_csv(snapshots, cols=csv.split(","), with_headers=with_headers) + else: + from archivebox.misc.logging_util import printable_folders + + # Convert to dict for printable_folders + folders: dict[str, Snapshot | None] = {str(snapshot.output_dir): snapshot for snapshot in snapshots} + output = printable_folders(folders, with_headers) + + # Structured exports must be written directly to stdout. + # rich.print() reflows long lines to console width, which corrupts JSON/CSV/HTML output. + sys.stdout.write(output) + if not output.endswith("\n"): + sys.stdout.write("\n") + return output + + +@click.command() +@click.option( + "--filter-type", + "-f", + type=click.Choice(["search", *LINK_FILTERS.keys()]), + default="substring", + help="Pattern matching type for filtering URLs", +) +@click.option("--status", "-s", type=click.Choice(STATUS_CHOICES), default="indexed", help="List snapshots with the given status") +@click.option("--before", "-b", type=float, help="List snapshots bookmarked before the given UNIX timestamp") +@click.option("--after", "-a", type=float, help="List snapshots bookmarked after the given UNIX timestamp") +@click.option("--sort", "-o", type=str, help="Field to sort by, e.g. 
url, created_at, bookmarked_at, downloaded_at") +@click.option("--json", "-J", is_flag=True, help="Print output in JSON format") +@click.option("--html", "-M", is_flag=True, help="Print output in HTML format (suitable for viewing statically without a server)") +@click.option("--csv", "-C", type=str, help="Print output as CSV with the provided fields, e.g.: created_at,url,title") +@click.option("--with-headers", "-H", is_flag=True, help="Include extra CSV/HTML headers in the output") +@click.help_option("--help", "-h") +@click.argument("filter_patterns", nargs=-1) +@docstring(search.__doc__) +def main(**kwargs): + return search(**kwargs) + + +if __name__ == "__main__": + main() diff --git a/archivebox/cli/archivebox_server.py b/archivebox/cli/archivebox_server.py index 4cc050dd0b..861ce775f6 100644 --- a/archivebox/cli/archivebox_server.py +++ b/archivebox/cli/archivebox_server.py @@ -1,76 +1,235 @@ #!/usr/bin/env python3 -__package__ = 'archivebox.cli' -__command__ = 'archivebox server' +__package__ = "archivebox.cli" +from collections.abc import Iterable import sys -import argparse -from typing import Optional, List, IO +import rich_click as click +from rich import print -from ..main import server -from ..util import docstring -from ..config import OUTPUT_DIR, BIND_ADDR -from ..logging_util import SmartFormatter, reject_stdin +from archivebox.misc.util import docstring, enforce_types +from archivebox.config.common import SERVER_CONFIG -@docstring(server.__doc__) -def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None: - parser = argparse.ArgumentParser( - prog=__command__, - description=server.__doc__, - add_help=True, - formatter_class=SmartFormatter, - ) - parser.add_argument( - 'runserver_args', - nargs='*', - type=str, - default=[BIND_ADDR], - help='Arguments to pass to Django runserver' - ) - parser.add_argument( - '--reload', - action='store_true', - help='Enable auto-reloading when code or templates change', - ) 
- parser.add_argument( - '--debug', - action='store_true', - help='Enable DEBUG=True mode with more verbose errors', - ) - parser.add_argument( - '--nothreading', - action='store_true', - help='Force runserver to run in single-threaded mode', + +def stop_existing_background_runner(*, machine, process_model, supervisor=None, stop_worker_fn=None, log=print) -> int: + """Stop any existing orchestrator process so the server can take ownership.""" + process_model.cleanup_stale_running(machine=machine) + process_model.cleanup_orphaned_workers() + + running_runners = list( + process_model.objects.filter( + machine=machine, + status=process_model.StatusChoices.RUNNING, + process_type=process_model.TypeChoices.ORCHESTRATOR, + ).order_by("created_at"), ) - parser.add_argument( - '--init', - action='store_true', - help='Run a full archivebox init/upgrade before starting the server', + + if not running_runners: + return 0 + + log("[yellow][*] Stopping existing ArchiveBox background runner...[/yellow]") + + if supervisor is not None and stop_worker_fn is not None: + for worker_name in ("worker_runner", "worker_runner_watch"): + try: + stop_worker_fn(supervisor, worker_name) + except Exception: + pass + + for proc in running_runners: + try: + proc.kill_tree(graceful_timeout=2.0) + except Exception: + try: + proc.terminate(graceful_timeout=2.0) + except Exception: + pass + + process_model.cleanup_stale_running(machine=machine) + return len(running_runners) + + +def _read_supervisor_worker_command(worker_name: str) -> str: + from archivebox.workers.supervisord_util import WORKERS_DIR_NAME, get_sock_file + + worker_conf = get_sock_file().parent / WORKERS_DIR_NAME / f"{worker_name}.conf" + if not worker_conf.exists(): + return "" + + for line in worker_conf.read_text().splitlines(): + if line.startswith("command="): + return line.removeprefix("command=").strip() + return "" + + +def _worker_command_matches_bind(command: str, host: str, port: str) -> bool: + if not command: + return 
False + return f"{host}:{port}" in command or (f"--bind={host}" in command and f"--port={port}" in command) + + +def stop_existing_server_workers(*, supervisor, stop_worker_fn, host: str, port: str, log=print) -> int: + """Stop existing ArchiveBox web workers if they already own the requested bind.""" + stopped = 0 + + for worker_name in ("worker_runserver", "worker_daphne"): + try: + proc = supervisor.getProcessInfo(worker_name) if supervisor else None + except Exception: + proc = None + if not isinstance(proc, dict) or proc.get("statename") != "RUNNING": + continue + + command = _read_supervisor_worker_command(worker_name) + if not _worker_command_matches_bind(command, host, port): + continue + + if stopped == 0: + log("[yellow][*] Taking over existing ArchiveBox web server on same port...[/yellow]") + stop_worker_fn(supervisor, worker_name) + stopped += 1 + + return stopped + + +@enforce_types +def server( + runserver_args: Iterable[str] = (SERVER_CONFIG.BIND_ADDR,), + reload: bool = False, + init: bool = False, + debug: bool = False, + daemonize: bool = False, + nothreading: bool = False, +) -> None: + """Run the ArchiveBox HTTP server""" + + runserver_args = list(runserver_args) + + if init: + from archivebox.cli.archivebox_init import init as archivebox_init + + archivebox_init(quick=True) + print() + + from archivebox.misc.checks import check_data_folder + + check_data_folder() + + from archivebox.config.common import SHELL_CONFIG + + run_in_debug = SHELL_CONFIG.DEBUG or debug or reload + if debug or reload: + SHELL_CONFIG.DEBUG = True + + from django.contrib.auth.models import User + + if not User.objects.filter(is_superuser=True).exclude(username="system").exists(): + print() + print( + "[violet]Hint:[/violet] To create an [bold]admin username & password[/bold] for the [deep_sky_blue3][underline][link=http://{host}:{port}/admin]Admin UI[/link][/underline][/deep_sky_blue3], run:", + ) + print(" [green]archivebox manage createsuperuser[/green]") + print() + 
+ host = "127.0.0.1" + port = "8000" + + try: + host_and_port = [arg for arg in runserver_args if arg.replace(".", "").replace(":", "").isdigit()][0] + if ":" in host_and_port: + host, port = host_and_port.split(":") + else: + if "." in host_and_port: + host = host_and_port + else: + port = host_and_port + except IndexError: + pass + + from archivebox.workers.supervisord_util import ( + get_existing_supervisord_process, + get_worker, + stop_worker, + start_server_workers, + is_port_in_use, ) - parser.add_argument( - '--quick-init', '-i', - action='store_true', - help='Run quick archivebox init/upgrade before starting the server', + from archivebox.machine.models import Machine, Process + + machine = Machine.current() + supervisor = get_existing_supervisord_process() + stop_existing_background_runner( + machine=machine, + process_model=Process, + supervisor=supervisor, + stop_worker_fn=stop_worker, ) - parser.add_argument( - '--createsuperuser', - action='store_true', - help='Run archivebox manage createsuperuser before starting the server', + if supervisor: + stop_existing_server_workers( + supervisor=supervisor, + stop_worker_fn=stop_worker, + host=host, + port=port, + ) + + # Check if port is already in use + if is_port_in_use(host, int(port)): + print(f"[red][X] Error: Port {port} is already in use[/red]") + print(f" Another process (possibly daphne or runserver) is already listening on {host}:{port}") + print(" Stop the conflicting process or choose a different port") + sys.exit(1) + + supervisor = get_existing_supervisord_process() + if supervisor: + server_worker_name = "worker_runserver" if run_in_debug else "worker_daphne" + server_proc = get_worker(supervisor, server_worker_name) + server_state = server_proc.get("statename") if isinstance(server_proc, dict) else None + if server_state == "RUNNING": + runner_proc = get_worker(supervisor, "worker_runner") + runner_watch_proc = get_worker(supervisor, "worker_runner_watch") + runner_state = 
runner_proc.get("statename") if isinstance(runner_proc, dict) else None + runner_watch_state = runner_watch_proc.get("statename") if isinstance(runner_watch_proc, dict) else None + print("[red][X] Error: ArchiveBox server is already running[/red]") + print( + f" [green]√[/green] Web server ({server_worker_name}) is RUNNING on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]", + ) + if runner_state == "RUNNING": + print(" [green]√[/green] Background runner (worker_runner) is RUNNING") + if runner_watch_state == "RUNNING": + print(" [green]√[/green] Reload watcher (worker_runner_watch) is RUNNING") + print() + print("[yellow]To stop the existing server, run:[/yellow]") + print(' pkill -f "archivebox server"') + print(" pkill -f supervisord") + sys.exit(1) + + if run_in_debug: + print("[green][+] Starting ArchiveBox webserver in DEBUG mode...[/green]") + else: + print("[green][+] Starting ArchiveBox webserver...[/green]") + print( + f" [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]", ) - command = parser.parse_args(args or ()) - reject_stdin(__command__, stdin) - - server( - runserver_args=command.runserver_args + (['--nothreading'] if command.nothreading else []), - reload=command.reload, - debug=command.debug, - init=command.init, - quick_init=command.quick_init, - createsuperuser=command.createsuperuser, - out_dir=pwd or OUTPUT_DIR, + print( + f" [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]", ) + print(" > Writing ArchiveBox error log to ./logs/errors.log") + print() + start_server_workers(host=host, port=port, daemonize=daemonize, debug=run_in_debug, reload=reload, nothreading=nothreading) + print("\n[i][green][🟩] ArchiveBox server shut down gracefully.[/green][/i]") + + +@click.command() 
+@click.argument("runserver_args", nargs=-1) +@click.option("--reload", is_flag=True, help="Enable auto-reloading when code or templates change") +@click.option("--debug", is_flag=True, help="Enable DEBUG=True mode with more verbose errors") +@click.option("--nothreading", is_flag=True, help="Force runserver to run in single-threaded mode") +@click.option("--init", is_flag=True, help="Run a full archivebox init/upgrade before starting the server") +@click.option("--daemonize", is_flag=True, help="Run the server in the background as a daemon") +@docstring(server.__doc__) +def main(**kwargs): + server(**kwargs) -if __name__ == '__main__': - main(args=sys.argv[1:], stdin=sys.stdin) +if __name__ == "__main__": + main() diff --git a/archivebox/cli/archivebox_setup.py b/archivebox/cli/archivebox_setup.py deleted file mode 100755 index 02ce57c999..0000000000 --- a/archivebox/cli/archivebox_setup.py +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env python3 - -__package__ = 'archivebox.cli' -__command__ = 'archivebox setup' - -import sys -import argparse - -from typing import Optional, List, IO - -from ..main import setup -from ..util import docstring -from ..config import OUTPUT_DIR -from ..logging_util import SmartFormatter, reject_stdin - - -@docstring(setup.__doc__) -def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None: - parser = argparse.ArgumentParser( - prog=__command__, - description=setup.__doc__, - add_help=True, - formatter_class=SmartFormatter, - ) - # parser.add_argument( - # '--force', # '-f', - # action='store_true', - # help='Overwrite any existing packages that conflict with the ones ArchiveBox is trying to install', - # ) - command = parser.parse_args(args or ()) # noqa - reject_stdin(__command__, stdin) - - setup( - # force=command.force, - out_dir=pwd or OUTPUT_DIR, - ) - - -if __name__ == '__main__': - main(args=sys.argv[1:], stdin=sys.stdin) diff --git a/archivebox/cli/archivebox_shell.py 
b/archivebox/cli/archivebox_shell.py index bcd5fdd6a2..26943d24ec 100644 --- a/archivebox/cli/archivebox_shell.py +++ b/archivebox/cli/archivebox_shell.py @@ -1,34 +1,28 @@ #!/usr/bin/env python3 -__package__ = 'archivebox.cli' -__command__ = 'archivebox shell' +__package__ = "archivebox.cli" -import sys -import argparse +from collections.abc import Iterable -from typing import Optional, List, IO +import rich_click as click -from ..main import shell -from ..util import docstring -from ..config import OUTPUT_DIR -from ..logging_util import SmartFormatter, reject_stdin +from archivebox.misc.util import docstring +def shell(args: Iterable[str] = ()) -> None: + """Enter an interactive ArchiveBox Django shell""" + + from django.core.management import call_command + + call_command("shell_plus", *args) + + +@click.command(add_help_option=False, context_settings=dict(ignore_unknown_options=True)) +@click.argument("args", nargs=-1) @docstring(shell.__doc__) -def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None: - parser = argparse.ArgumentParser( - prog=__command__, - description=shell.__doc__, - add_help=True, - formatter_class=SmartFormatter, - ) - parser.parse_args(args or ()) - reject_stdin(__command__, stdin) - - shell( - out_dir=pwd or OUTPUT_DIR, - ) - - -if __name__ == '__main__': - main(args=sys.argv[1:], stdin=sys.stdin) +def main(args: Iterable[str] = ()) -> None: + shell(args=args) + + +if __name__ == "__main__": + main() diff --git a/archivebox/cli/archivebox_snapshot.py b/archivebox/cli/archivebox_snapshot.py new file mode 100644 index 0000000000..e912654928 --- /dev/null +++ b/archivebox/cli/archivebox_snapshot.py @@ -0,0 +1,497 @@ +#!/usr/bin/env python3 + +""" +archivebox snapshot [args...] [--filters] + +Manage Snapshot records. 
+ +Actions: + create - Create Snapshots from URLs or Crawl JSONL + list - List Snapshots as JSONL (with optional filters) + update - Update Snapshots from stdin JSONL + delete - Delete Snapshots from stdin JSONL + +Examples: + # Create + archivebox snapshot create https://example.com --tag=news + archivebox crawl create https://example.com | archivebox snapshot create + + # List with filters + archivebox snapshot list --status=queued + archivebox snapshot list --url__icontains=example.com + + # Update + archivebox snapshot list --tag=old | archivebox snapshot update --tag=new + + # Delete + archivebox snapshot list --url__icontains=spam.com | archivebox snapshot delete --yes +""" + +__package__ = "archivebox.cli" +__command__ = "archivebox snapshot" + +import sys +from collections.abc import Iterable + +import rich_click as click +from rich import print as rprint +from django.db.models import Q, Sum +from django.db.models.functions import Coalesce + +from archivebox.cli.cli_utils import apply_filters + + +# ============================================================================= +# CREATE +# ============================================================================= + + +def create_snapshots( + urls: Iterable[str], + tag: str = "", + status: str = "queued", + depth: int = 0, + created_by_id: int | None = None, +) -> int: + """ + Create Snapshots from URLs or stdin JSONL (Crawl or Snapshot records). + Pass-through: Records that are not Crawl/Snapshot/URL are output unchanged. 
+ + Exit codes: + 0: Success + 1: Failure + """ + from archivebox.misc.jsonl import ( + read_args_or_stdin, + write_record, + TYPE_SNAPSHOT, + TYPE_CRAWL, + ) + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.core.models import Snapshot + from archivebox.crawls.models import Crawl + + created_by_id = created_by_id or get_or_create_system_user_pk() + is_tty = sys.stdout.isatty() + + # Collect all input records + records = list(read_args_or_stdin(urls)) + + if not records: + rprint("[yellow]No URLs or Crawls provided. Pass URLs as arguments or via stdin.[/yellow]", file=sys.stderr) + return 1 + + # Process each record - handle Crawls and plain URLs/Snapshots + created_snapshots = [] + pass_through_count = 0 + + for record in records: + record_type = record.get("type", "") + + try: + if record_type == TYPE_CRAWL: + # Pass through the Crawl record itself first + if not is_tty: + write_record(record) + + # Input is a Crawl - get or create it, then create Snapshots for its URLs + crawl = None + crawl_id = record.get("id") + if crawl_id: + try: + crawl = Crawl.objects.get(id=crawl_id) + except Crawl.DoesNotExist: + crawl = Crawl.from_json(record, overrides={"created_by_id": created_by_id}) + else: + crawl = Crawl.from_json(record, overrides={"created_by_id": created_by_id}) + + if not crawl: + continue + + # Create snapshots for each URL in the crawl + for url in crawl.get_urls_list(): + merged_tags = crawl.tags_str + if tag: + merged_tags = f"{merged_tags},{tag}" if merged_tags else tag + snapshot_record = { + "url": url, + "tags": merged_tags, + "crawl_id": str(crawl.id), + "depth": depth, + "status": status, + } + snapshot = Snapshot.from_json(snapshot_record, overrides={"created_by_id": created_by_id}) + if snapshot: + created_snapshots.append(snapshot) + if not is_tty: + write_record(snapshot.to_json()) + + elif record_type == TYPE_SNAPSHOT or record.get("url"): + # Input is a Snapshot or plain URL + if tag and not 
record.get("tags"): + record["tags"] = tag + if status: + record["status"] = status + record["depth"] = record.get("depth", depth) + + snapshot = Snapshot.from_json(record, overrides={"created_by_id": created_by_id}) + if snapshot: + created_snapshots.append(snapshot) + if not is_tty: + write_record(snapshot.to_json()) + + else: + # Pass-through: output records we don't handle + if not is_tty: + write_record(record) + pass_through_count += 1 + + except Exception as e: + rprint(f"[red]Error creating snapshot: {e}[/red]", file=sys.stderr) + continue + + if not created_snapshots: + if pass_through_count > 0: + rprint(f"[dim]Passed through {pass_through_count} records, no new snapshots[/dim]", file=sys.stderr) + return 0 + rprint("[red]No snapshots created[/red]", file=sys.stderr) + return 1 + + rprint(f"[green]Created {len(created_snapshots)} snapshots[/green]", file=sys.stderr) + + if is_tty: + for snapshot in created_snapshots: + rprint(f" [dim]{snapshot.id}[/dim] {snapshot.url[:60]}", file=sys.stderr) + + return 0 + + +# ============================================================================= +# LIST +# ============================================================================= + + +def list_snapshots( + status: str | None = None, + url__icontains: str | None = None, + url__istartswith: str | None = None, + tag: str | None = None, + crawl_id: str | None = None, + limit: int | None = None, + sort: str | None = None, + csv: str | None = None, + with_headers: bool = False, + search: str | None = None, + query: str | None = None, +) -> int: + """ + List Snapshots as JSONL with optional filters. 
+ + Exit codes: + 0: Success (even if no results) + """ + from archivebox.misc.jsonl import write_record + from archivebox.core.models import Snapshot + from archivebox.search import ( + get_default_search_mode, + get_search_mode, + prioritize_metadata_matches, + query_search_index, + ) + + if with_headers and not csv: + rprint("[red]--with-headers requires --csv[/red]", file=sys.stderr) + return 2 + + is_tty = sys.stdout.isatty() and not csv + + queryset = Snapshot.objects.annotate(output_size_sum=Coalesce(Sum("archiveresult__output_size"), 0)).order_by("-created_at") + + # Apply filters + filter_kwargs = { + "status": status, + "url__icontains": url__icontains, + "url__istartswith": url__istartswith, + "crawl_id": crawl_id, + } + queryset = apply_filters(queryset, filter_kwargs) + + # Tag filter requires special handling (M2M) + if tag: + queryset = queryset.filter(tags__name__iexact=tag) + + query = (query or "").strip() + if query: + metadata_qs = queryset.filter( + Q(title__icontains=query) | Q(url__icontains=query) | Q(timestamp__icontains=query) | Q(tags__name__icontains=query), + ) + requested_search_mode = (search or "").strip().lower() + if requested_search_mode == "content": + requested_search_mode = "contents" + search_mode = get_default_search_mode() if not requested_search_mode else get_search_mode(requested_search_mode) + + if search_mode == "meta": + queryset = metadata_qs + else: + try: + deep_qsearch = None + if search_mode == "deep": + qsearch = query_search_index(query, search_mode="contents") + deep_qsearch = query_search_index(query, search_mode="deep") + else: + qsearch = query_search_index(query, search_mode=search_mode) + queryset = prioritize_metadata_matches( + queryset, + metadata_qs, + qsearch, + deep_queryset=deep_qsearch, + ordering=("-created_at",) if not sort else None, + ) + except Exception as err: + rprint( + f"[yellow]Search backend error, falling back to metadata search: {err}[/yellow]", + file=sys.stderr, + ) + queryset = 
metadata_qs + + if sort: + queryset = queryset.order_by(sort) + if limit: + queryset = queryset[:limit] + + count = 0 + if csv: + cols = [col.strip() for col in csv.split(",") if col.strip()] + if not cols: + rprint("[red]No CSV columns provided[/red]", file=sys.stderr) + return 2 + rows: list[str] = [] + if with_headers: + rows.append(",".join(cols)) + for snapshot in queryset.iterator(chunk_size=500): + rows.append(snapshot.to_csv(cols=cols, separator=",")) + count += 1 + output = "\n".join(rows) + if output: + sys.stdout.write(output) + if not output.endswith("\n"): + sys.stdout.write("\n") + rprint(f"[dim]Listed {count} snapshots[/dim]", file=sys.stderr) + return 0 + + for snapshot in queryset: + if is_tty: + status_color = { + "queued": "yellow", + "started": "blue", + "sealed": "green", + }.get(snapshot.status, "dim") + rprint(f"[{status_color}]{snapshot.status:8}[/{status_color}] [dim]{snapshot.id}[/dim] {snapshot.url[:60]}") + else: + write_record(snapshot.to_json()) + count += 1 + + rprint(f"[dim]Listed {count} snapshots[/dim]", file=sys.stderr) + return 0 + + +# ============================================================================= +# UPDATE +# ============================================================================= + + +def update_snapshots( + status: str | None = None, + tag: str | None = None, +) -> int: + """ + Update Snapshots from stdin JSONL. + + Reads Snapshot records from stdin and applies updates. + Uses PATCH semantics - only specified fields are updated. 
+ + Exit codes: + 0: Success + 1: No input or error + """ + from django.utils import timezone + + from archivebox.misc.jsonl import read_stdin, write_record + from archivebox.core.models import Snapshot + + is_tty = sys.stdout.isatty() + + records = list(read_stdin()) + if not records: + rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr) + return 1 + + updated_count = 0 + for record in records: + snapshot_id = record.get("id") + if not snapshot_id: + continue + + try: + snapshot = Snapshot.objects.get(id=snapshot_id) + + # Apply updates from CLI flags (override stdin values) + if status: + snapshot.status = status + snapshot.retry_at = timezone.now() + if tag: + # Add tag to existing tags + snapshot.save() # Ensure saved before M2M + from archivebox.core.models import Tag + + tag_obj, _ = Tag.objects.get_or_create(name=tag) + snapshot.tags.add(tag_obj) + + snapshot.save() + updated_count += 1 + + if not is_tty: + write_record(snapshot.to_json()) + + except Snapshot.DoesNotExist: + rprint(f"[yellow]Snapshot not found: {snapshot_id}[/yellow]", file=sys.stderr) + continue + + rprint(f"[green]Updated {updated_count} snapshots[/green]", file=sys.stderr) + return 0 + + +# ============================================================================= +# DELETE +# ============================================================================= + + +def delete_snapshots(yes: bool = False, dry_run: bool = False) -> int: + """ + Delete Snapshots from stdin JSONL. + + Requires --yes flag to confirm deletion. 
+ + Exit codes: + 0: Success + 1: No input or missing --yes flag + """ + from archivebox.misc.jsonl import read_stdin + from archivebox.core.models import Snapshot + + records = list(read_stdin()) + if not records: + rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr) + return 1 + + snapshot_ids = [r.get("id") for r in records if r.get("id")] + + if not snapshot_ids: + rprint("[yellow]No valid snapshot IDs in input[/yellow]", file=sys.stderr) + return 1 + + snapshots = Snapshot.objects.filter(id__in=snapshot_ids) + count = snapshots.count() + + if count == 0: + rprint("[yellow]No matching snapshots found[/yellow]", file=sys.stderr) + return 0 + + if dry_run: + rprint(f"[yellow]Would delete {count} snapshots (dry run)[/yellow]", file=sys.stderr) + for snapshot in snapshots: + rprint(f" [dim]{snapshot.id}[/dim] {snapshot.url[:60]}", file=sys.stderr) + return 0 + + if not yes: + rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr) + return 1 + + # Perform deletion + deleted_count, _ = snapshots.delete() + rprint(f"[green]Deleted {deleted_count} snapshots[/green]", file=sys.stderr) + return 0 + + +# ============================================================================= +# CLI Commands +# ============================================================================= + + +@click.group() +def main(): + """Manage Snapshot records.""" + pass + + +@main.command("create") +@click.argument("urls", nargs=-1) +@click.option("--tag", "-t", default="", help="Comma-separated tags to add") +@click.option("--status", "-s", default="queued", help="Initial status (default: queued)") +@click.option("--depth", "-d", type=int, default=0, help="Crawl depth (default: 0)") +def create_cmd(urls: tuple, tag: str, status: str, depth: int): + """Create Snapshots from URLs or stdin JSONL.""" + sys.exit(create_snapshots(urls, tag=tag, status=status, depth=depth)) + + +@main.command("list") +@click.option("--status", "-s", help="Filter by status (queued, 
started, sealed)") +@click.option("--url__icontains", help="Filter by URL contains") +@click.option("--url__istartswith", help="Filter by URL starts with") +@click.option("--tag", "-t", help="Filter by tag name") +@click.option("--crawl-id", help="Filter by crawl ID") +@click.option("--limit", "-n", type=int, help="Limit number of results") +@click.option("--sort", "-o", type=str, help="Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at") +@click.option("--csv", "-C", type=str, help="Print output as CSV with the provided fields, e.g.: timestamp,url,title") +@click.option("--with-headers", is_flag=True, help="Include column headers in structured output") +@click.option("--search", type=click.Choice(["meta", "content", "contents", "deep"]), help="Search mode to use for the query") +@click.argument("query", nargs=-1) +def list_cmd( + status: str | None, + url__icontains: str | None, + url__istartswith: str | None, + tag: str | None, + crawl_id: str | None, + limit: int | None, + sort: str | None, + csv: str | None, + with_headers: bool, + search: str | None, + query: tuple[str, ...], +): + """List Snapshots as JSONL.""" + sys.exit( + list_snapshots( + status=status, + url__icontains=url__icontains, + url__istartswith=url__istartswith, + tag=tag, + crawl_id=crawl_id, + limit=limit, + sort=sort, + csv=csv, + with_headers=with_headers, + search=search, + query=" ".join(query), + ), + ) + + +@main.command("update") +@click.option("--status", "-s", help="Set status") +@click.option("--tag", "-t", help="Add tag") +def update_cmd(status: str | None, tag: str | None): + """Update Snapshots from stdin JSONL.""" + sys.exit(update_snapshots(status=status, tag=tag)) + + +@main.command("delete") +@click.option("--yes", "-y", is_flag=True, help="Confirm deletion") +@click.option("--dry-run", is_flag=True, help="Show what would be deleted") +def delete_cmd(yes: bool, dry_run: bool): + """Delete Snapshots from stdin JSONL.""" + sys.exit(delete_snapshots(yes=yes, 
dry_run=dry_run)) + + +if __name__ == "__main__": + main() diff --git a/archivebox/cli/archivebox_snapshot_compat.py b/archivebox/cli/archivebox_snapshot_compat.py new file mode 100644 index 0000000000..62f684e0aa --- /dev/null +++ b/archivebox/cli/archivebox_snapshot_compat.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python3 + +__package__ = "archivebox.cli" +__command__ = "archivebox snapshot" + +import sys + +import rich_click as click + +from archivebox.cli.archivebox_snapshot import create_snapshots + + +@click.command(context_settings={"ignore_unknown_options": True}) +@click.option("--tag", "-t", default="", help="Comma-separated tags to add") +@click.option("--status", "-s", default="queued", help="Initial status (default: queued)") +@click.option("--depth", "-d", type=int, default=0, help="Crawl depth (default: 0)") +@click.argument("urls", nargs=-1) +def main(tag: str, status: str, depth: int, urls: tuple[str, ...]): + """Backwards-compatible `archivebox snapshot URL...` entrypoint.""" + sys.exit(create_snapshots(urls, tag=tag, status=status, depth=depth)) + + +if __name__ == "__main__": + main() diff --git a/archivebox/cli/archivebox_status.py b/archivebox/cli/archivebox_status.py index 2bef19c7b4..b64ecddbf5 100644 --- a/archivebox/cli/archivebox_status.py +++ b/archivebox/cli/archivebox_status.py @@ -1,32 +1,139 @@ #!/usr/bin/env python3 -__package__ = 'archivebox.cli' -__command__ = 'archivebox status' +__package__ = "archivebox.cli" -import sys -import argparse +from pathlib import Path -from typing import Optional, List, IO +import rich_click as click +from rich import print -from ..main import status -from ..util import docstring -from ..config import OUTPUT_DIR -from ..logging_util import SmartFormatter, reject_stdin +from archivebox.misc.util import enforce_types, docstring +from archivebox.config import DATA_DIR, CONSTANTS, ARCHIVE_DIR +from archivebox.config.common import SHELL_CONFIG +from archivebox.misc.legacy import parse_json_links_details +from 
archivebox.misc.system import get_dir_size +from archivebox.misc.logging_util import printable_filesize -@docstring(status.__doc__) -def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None: - parser = argparse.ArgumentParser( - prog=__command__, - description=status.__doc__, - add_help=True, - formatter_class=SmartFormatter, - ) - parser.parse_args(args or ()) - reject_stdin(__command__, stdin) +@enforce_types +def status(out_dir: Path = DATA_DIR) -> None: + """Print out some info and statistics about the archive collection""" + + from django.contrib.auth import get_user_model + from django.db.models import Sum + from django.db.models.functions import Coalesce + from archivebox.core.models import Snapshot + + User = get_user_model() + + print("[green]\\[*] Scanning archive main index...[/green]") + print(f"[yellow] {out_dir}/*[/yellow]") + num_bytes, num_dirs, num_files = get_dir_size(out_dir, recursive=False, pattern="index.") + size = printable_filesize(num_bytes) + print(f" Index size: {size} across {num_files} files") + print() + + links = list(Snapshot.objects.annotate(output_size_sum=Coalesce(Sum("archiveresult__output_size"), 0))) + num_sql_links = len(links) + num_link_details = sum(1 for link in parse_json_links_details(out_dir=out_dir)) + print(f" > SQL Main Index: {num_sql_links} links".ljust(36), f"(found in {CONSTANTS.SQL_INDEX_FILENAME})") + print(f" > JSON Link Details: {num_link_details} links".ljust(36), f"(found in {ARCHIVE_DIR.name}/*/index.json)") + print() + print("[green]\\[*] Scanning archive data directories...[/green]") + users_dir = out_dir / "users" + scan_roots = [root for root in (ARCHIVE_DIR, users_dir) if root.exists()] + scan_roots_display = ", ".join(str(root) for root in scan_roots) if scan_roots else str(ARCHIVE_DIR) + print(f"[yellow] {scan_roots_display}[/yellow]") + num_bytes = num_dirs = num_files = 0 + for root in scan_roots: + root_bytes, root_dirs, root_files = get_dir_size(root) 
+ num_bytes += root_bytes + num_dirs += root_dirs + num_files += root_files + size = printable_filesize(num_bytes) + print(f" Size: {size} across {num_files} files in {num_dirs} directories") + + # Use DB as source of truth for snapshot status + num_indexed = len(links) + num_archived = sum(1 for snapshot in links if snapshot.is_archived) + num_unarchived = max(num_indexed - num_archived, 0) + print(f" > indexed: {num_indexed}".ljust(36), "(total snapshots in DB)") + print(f" > archived: {num_archived}".ljust(36), "(snapshots with archived content)") + print(f" > unarchived: {num_unarchived}".ljust(36), "(snapshots pending archiving)") + + # Count snapshot directories on filesystem across both legacy and current layouts. + expected_snapshot_dirs = {str(Path(snapshot.output_dir).resolve()) for snapshot in links if Path(snapshot.output_dir).exists()} + discovered_snapshot_dirs = set() + + if ARCHIVE_DIR.exists(): + discovered_snapshot_dirs.update(str(entry.resolve()) for entry in ARCHIVE_DIR.iterdir() if entry.is_dir()) + + if users_dir.exists(): + discovered_snapshot_dirs.update(str(entry.resolve()) for entry in users_dir.glob("*/snapshots/*/*/*") if entry.is_dir()) - status(out_dir=pwd or OUTPUT_DIR) + orphaned_dirs = sorted(discovered_snapshot_dirs - expected_snapshot_dirs) + num_present = len(discovered_snapshot_dirs) + num_valid = len(discovered_snapshot_dirs & expected_snapshot_dirs) + print() + print(f" > present: {num_present}".ljust(36), "(snapshot directories on disk)") + print(f" > [green]valid:[/green] {num_valid}".ljust(36), " (directories with matching DB entry)") + + num_orphaned = len(orphaned_dirs) + print(f" > [red]orphaned:[/red] {num_orphaned}".ljust(36), " (directories without matching DB entry)") + + if num_indexed: + print(" [violet]Hint:[/violet] You can list snapshots by status like so:") + print(" [green]archivebox list --status= (e.g. 
archived, queued, etc.)[/green]") + + if orphaned_dirs: + print(" [violet]Hint:[/violet] To automatically import orphaned data directories into the main index, run:") + print(" [green]archivebox init[/green]") + + print() + print("[green]\\[*] Scanning recent archive changes and user logins:[/green]") + print(f"[yellow] {CONSTANTS.LOGS_DIR}/*[/yellow]") + admin_users = User.objects.filter(is_superuser=True).exclude(username="system") + users = [user.get_username() for user in admin_users] + print(f" UI users {len(users)}: {', '.join(users)}") + last_login = admin_users.order_by("last_login").last() + if last_login: + print(f" Last UI login: {last_login.get_username()} @ {str(last_login.last_login)[:16]}") + last_downloaded = Snapshot.objects.order_by("downloaded_at").last() + if last_downloaded: + print(f" Last changes: {str(last_downloaded.downloaded_at)[:16]}") + + if not users: + print() + print(" [violet]Hint:[/violet] You can create an admin user by running:") + print(" [green]archivebox manage createsuperuser[/green]") + + print() + recent_snapshots = sorted( + links, + key=lambda snapshot: snapshot.downloaded_at or snapshot.modified_at or snapshot.created_at, + reverse=True, + )[:10] + for snapshot in recent_snapshots: + if not snapshot.downloaded_at: + continue + print( + ( + "[grey53] " + f" > {str(snapshot.downloaded_at)[:16]} " + f"[{snapshot.num_outputs} {('X', '√')[snapshot.is_archived]} {printable_filesize(snapshot.archive_size)}] " + f'"{snapshot.title}": {snapshot.url}' + "[/grey53]" + )[: SHELL_CONFIG.TERM_WIDTH], + ) + print("[grey53] ...") + + +@click.command() +@docstring(status.__doc__) +def main(**kwargs): + """Print out some info and statistics about the archive collection""" + status(**kwargs) -if __name__ == '__main__': - main(args=sys.argv[1:], stdin=sys.stdin) +if __name__ == "__main__": + main() diff --git a/archivebox/cli/archivebox_tag.py b/archivebox/cli/archivebox_tag.py new file mode 100644 index 0000000000..73352d5dc3 --- /dev/null 
+++ b/archivebox/cli/archivebox_tag.py @@ -0,0 +1,299 @@ +#!/usr/bin/env python3 + +""" +archivebox tag [args...] [--filters] + +Manage Tag records. + +Actions: + create - Create Tags + list - List Tags as JSONL (with optional filters) + update - Update Tags from stdin JSONL + delete - Delete Tags from stdin JSONL + +Examples: + # Create + archivebox tag create news tech science + archivebox tag create "important stuff" + + # List + archivebox tag list + archivebox tag list --name__icontains=news + + # Update (rename tags) + archivebox tag list --name=oldname | archivebox tag update --name=newname + + # Delete + archivebox tag list --name=unused | archivebox tag delete --yes +""" + +__package__ = "archivebox.cli" +__command__ = "archivebox tag" + +import sys +from collections.abc import Iterable + +import rich_click as click +from rich import print as rprint + +from archivebox.cli.cli_utils import apply_filters + + +# ============================================================================= +# CREATE +# ============================================================================= + + +def create_tags(names: Iterable[str]) -> int: + """ + Create Tags from names. + + Exit codes: + 0: Success + 1: Failure + """ + from archivebox.misc.jsonl import write_record + from archivebox.core.models import Tag + + is_tty = sys.stdout.isatty() + + # Convert to list if needed + name_list = list(names) if names else [] + + if not name_list: + rprint("[yellow]No tag names provided. 
Pass names as arguments.[/yellow]", file=sys.stderr) + return 1 + + created_count = 0 + for name in name_list: + name = name.strip() + if not name: + continue + + tag, created = Tag.objects.get_or_create(name=name) + + if not is_tty: + write_record(tag.to_json()) + + if created: + created_count += 1 + rprint(f"[green]Created tag: {name}[/green]", file=sys.stderr) + else: + rprint(f"[dim]Tag already exists: {name}[/dim]", file=sys.stderr) + + rprint(f"[green]Created {created_count} new tags[/green]", file=sys.stderr) + return 0 + + +# ============================================================================= +# LIST +# ============================================================================= + + +def list_tags( + name: str | None = None, + name__icontains: str | None = None, + limit: int | None = None, +) -> int: + """ + List Tags as JSONL with optional filters. + + Exit codes: + 0: Success (even if no results) + """ + from archivebox.misc.jsonl import write_record + from archivebox.core.models import Tag + + is_tty = sys.stdout.isatty() + + queryset = Tag.objects.all().order_by("name") + + # Apply filters + filter_kwargs = { + "name": name, + "name__icontains": name__icontains, + } + queryset = apply_filters(queryset, filter_kwargs, limit=limit) + + count = 0 + for tag in queryset: + snapshot_count = tag.snapshot_set.count() + if is_tty: + rprint(f"[cyan]{tag.name:30}[/cyan] [dim]({snapshot_count} snapshots)[/dim]") + else: + write_record(tag.to_json()) + count += 1 + + rprint(f"[dim]Listed {count} tags[/dim]", file=sys.stderr) + return 0 + + +# ============================================================================= +# UPDATE +# ============================================================================= + + +def update_tags(name: str | None = None) -> int: + """ + Update Tags from stdin JSONL. + + Reads Tag records from stdin and applies updates. + Uses PATCH semantics - only specified fields are updated. 
+ + Exit codes: + 0: Success + 1: No input or error + """ + from archivebox.misc.jsonl import read_stdin, write_record + from archivebox.core.models import Tag + + is_tty = sys.stdout.isatty() + + records = list(read_stdin()) + if not records: + rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr) + return 1 + + updated_count = 0 + for record in records: + tag_id = record.get("id") + old_name = record.get("name") + + if not tag_id and not old_name: + continue + + try: + if tag_id: + tag = Tag.objects.get(id=tag_id) + else: + tag = Tag.objects.get(name=old_name) + + # Apply updates from CLI flags + if name: + tag.name = name + tag.save() + + updated_count += 1 + + if not is_tty: + write_record(tag.to_json()) + + except Tag.DoesNotExist: + rprint(f"[yellow]Tag not found: {tag_id or old_name}[/yellow]", file=sys.stderr) + continue + + rprint(f"[green]Updated {updated_count} tags[/green]", file=sys.stderr) + return 0 + + +# ============================================================================= +# DELETE +# ============================================================================= + + +def delete_tags(yes: bool = False, dry_run: bool = False) -> int: + """ + Delete Tags from stdin JSONL. + + Requires --yes flag to confirm deletion. 
+ + Exit codes: + 0: Success + 1: No input or missing --yes flag + """ + from archivebox.misc.jsonl import read_stdin + from archivebox.core.models import Tag + + records = list(read_stdin()) + if not records: + rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr) + return 1 + + # Collect tag IDs or names + tag_ids = [] + tag_names = [] + for r in records: + if r.get("id"): + tag_ids.append(r["id"]) + elif r.get("name"): + tag_names.append(r["name"]) + + if not tag_ids and not tag_names: + rprint("[yellow]No valid tag IDs or names in input[/yellow]", file=sys.stderr) + return 1 + + from django.db.models import Q + + query = Q() + if tag_ids: + query |= Q(id__in=tag_ids) + if tag_names: + query |= Q(name__in=tag_names) + + tags = Tag.objects.filter(query) + count = tags.count() + + if count == 0: + rprint("[yellow]No matching tags found[/yellow]", file=sys.stderr) + return 0 + + if dry_run: + rprint(f"[yellow]Would delete {count} tags (dry run)[/yellow]", file=sys.stderr) + for tag in tags: + rprint(f" {tag.name}", file=sys.stderr) + return 0 + + if not yes: + rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr) + return 1 + + # Perform deletion + deleted_count, _ = tags.delete() + rprint(f"[green]Deleted {deleted_count} tags[/green]", file=sys.stderr) + return 0 + + +# ============================================================================= +# CLI Commands +# ============================================================================= + + +@click.group() +def main(): + """Manage Tag records.""" + pass + + +@main.command("create") +@click.argument("names", nargs=-1) +def create_cmd(names: tuple): + """Create Tags from names.""" + sys.exit(create_tags(names)) + + +@main.command("list") +@click.option("--name", help="Filter by exact name") +@click.option("--name__icontains", help="Filter by name contains") +@click.option("--limit", "-n", type=int, help="Limit number of results") +def list_cmd(name: str | None, name__icontains: 
str | None, limit: int | None): + """List Tags as JSONL.""" + sys.exit(list_tags(name=name, name__icontains=name__icontains, limit=limit)) + + +@main.command("update") +@click.option("--name", "-n", help="Set new name") +def update_cmd(name: str | None): + """Update Tags from stdin JSONL.""" + sys.exit(update_tags(name=name)) + + +@main.command("delete") +@click.option("--yes", "-y", is_flag=True, help="Confirm deletion") +@click.option("--dry-run", is_flag=True, help="Show what would be deleted") +def delete_cmd(yes: bool, dry_run: bool): + """Delete Tags from stdin JSONL.""" + sys.exit(delete_tags(yes=yes, dry_run=dry_run)) + + +if __name__ == "__main__": + main() diff --git a/archivebox/cli/archivebox_update.py b/archivebox/cli/archivebox_update.py index 500d4c072b..659fcb976b 100644 --- a/archivebox/cli/archivebox_update.py +++ b/archivebox/cli/archivebox_update.py @@ -1,136 +1,600 @@ #!/usr/bin/env python3 -__package__ = 'archivebox.cli' -__command__ = 'archivebox update' - -import sys -import argparse - -from typing import List, Optional, IO - -from ..main import update -from ..util import docstring -from ..config import OUTPUT_DIR -from ..index import ( - LINK_FILTERS, - get_indexed_folders, - get_archived_folders, - get_unarchived_folders, - get_present_folders, - get_valid_folders, - get_invalid_folders, - get_duplicate_folders, - get_orphaned_folders, - get_corrupted_folders, - get_unrecognized_folders, -) -from ..logging_util import SmartFormatter, accept_stdin +__package__ = "archivebox.cli" +import os +import time -@docstring(update.__doc__) -def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None: - parser = argparse.ArgumentParser( - prog=__command__, - description=update.__doc__, - add_help=True, - formatter_class=SmartFormatter, - ) - parser.add_argument( - '--only-new', #'-n', - action='store_true', - help="Don't attempt to retry previously skipped/failed links when updating", - ) - parser.add_argument( 
- '--index-only', #'-o', - action='store_true', - help="Update the main index without archiving any content", - ) - parser.add_argument( - '--resume', #'-r', - type=float, - help='Resume the update process from a given timestamp', - default=None, - ) - parser.add_argument( - '--overwrite', #'-x', - action='store_true', - help='Ignore existing archived content and overwrite with new versions (DANGEROUS)', - ) - parser.add_argument( - '--before', #'-b', - type=float, - help="Update only links bookmarked before the given timestamp.", - default=None, +from typing import TYPE_CHECKING, Any +from collections.abc import Callable, Iterable +from pathlib import Path + +import rich_click as click +from django.core.exceptions import ObjectDoesNotExist +from django.db.models import Q, QuerySet + +from archivebox.misc.util import enforce_types, docstring + +if TYPE_CHECKING: + from archivebox.core.models import Snapshot + from archivebox.crawls.models import Crawl + + +LINK_FILTERS: dict[str, Callable[[str], Q]] = { + "exact": lambda pattern: Q(url=pattern), + "substring": lambda pattern: Q(url__icontains=pattern), + "regex": lambda pattern: Q(url__iregex=pattern), + "domain": lambda pattern: ( + Q(url__istartswith=f"http://{pattern}") | Q(url__istartswith=f"https://{pattern}") | Q(url__istartswith=f"ftp://{pattern}") + ), + "tag": lambda pattern: Q(tags__name=pattern), + "timestamp": lambda pattern: Q(timestamp=pattern), +} + + +def _apply_pattern_filters( + snapshots: QuerySet["Snapshot", "Snapshot"], + filter_patterns: list[str], + filter_type: str, +) -> QuerySet["Snapshot", "Snapshot"]: + filter_builder = LINK_FILTERS.get(filter_type) + if filter_builder is None: + raise SystemExit(2) + + query = Q() + for pattern in filter_patterns: + query |= filter_builder(pattern) + return snapshots.filter(query) + + +def _get_snapshot_crawl(snapshot: "Snapshot") -> "Crawl | None": + try: + return snapshot.crawl + except ObjectDoesNotExist: + return None + + +def 
_get_search_indexing_plugins() -> list[str]: + from abx_dl.models import discover_plugins + from archivebox.hooks import get_search_backends + + available_backends = set(get_search_backends()) + plugins = discover_plugins() + return sorted( + plugin_name + for plugin_name, plugin in plugins.items() + if plugin_name.startswith("search_backend_") + and plugin_name.removeprefix("search_backend_") in available_backends + and any("Snapshot" in hook.name and "index" in hook.name.lower() for hook in plugin.hooks) ) - parser.add_argument( - '--after', #'-a', - type=float, - help="Update only links bookmarked after the given timestamp.", - default=None, + + +def _build_filtered_snapshots_queryset( + *, + filter_patterns: Iterable[str], + filter_type: str, + before: float | None, + after: float | None, + resume: str | None = None, +): + from archivebox.core.models import Snapshot + from datetime import datetime + + snapshots = Snapshot.objects.all() + + if filter_patterns: + snapshots = _apply_pattern_filters(snapshots, list(filter_patterns), filter_type) + + if before: + snapshots = snapshots.filter(bookmarked_at__lt=datetime.fromtimestamp(before)) + if after: + snapshots = snapshots.filter(bookmarked_at__gt=datetime.fromtimestamp(after)) + if resume: + snapshots = snapshots.filter(timestamp__lte=resume) + + return snapshots.select_related("crawl").order_by("-bookmarked_at") + + +def reindex_snapshots( + snapshots: QuerySet["Snapshot", "Snapshot"], + *, + search_plugins: list[str], + batch_size: int, +) -> dict[str, int]: + from archivebox.cli.archivebox_extract import run_plugins + + stats = {"processed": 0, "reconciled": 0, "queued": 0, "reindexed": 0} + records: list[dict[str, str]] = [] + + total = snapshots.count() + print(f"[*] Reindexing {total} snapshots with search plugins: {', '.join(search_plugins)}") + + for snapshot in snapshots.iterator(chunk_size=batch_size): + stats["processed"] += 1 + + if _get_snapshot_crawl(snapshot) is None: + continue + + output_dir = 
Path(snapshot.output_dir) + has_directory = output_dir.exists() and output_dir.is_dir() + if has_directory: + snapshot.reconcile_with_index_json() + stats["reconciled"] += 1 + + for plugin_name in search_plugins: + existing_result = snapshot.archiveresult_set.filter(plugin=plugin_name).order_by("-created_at").first() + if existing_result: + existing_result.reset_for_retry() + records.append( + { + "type": "ArchiveResult", + "snapshot_id": str(snapshot.id), + "plugin": plugin_name, + }, + ) + stats["queued"] += 1 + + if not records: + return stats + + exit_code = run_plugins( + args=(), + records=records, + wait=True, + emit_results=False, ) - parser.add_argument( - '--status', - type=str, - choices=('indexed', 'archived', 'unarchived', 'present', 'valid', 'invalid', 'duplicate', 'orphaned', 'corrupted', 'unrecognized'), - default='indexed', - help=( - 'Update only links or data directories that have the given status\n' - f' indexed {get_indexed_folders.__doc__} (the default)\n' - f' archived {get_archived_folders.__doc__}\n' - f' unarchived {get_unarchived_folders.__doc__}\n' - '\n' - f' present {get_present_folders.__doc__}\n' - f' valid {get_valid_folders.__doc__}\n' - f' invalid {get_invalid_folders.__doc__}\n' - '\n' - f' duplicate {get_duplicate_folders.__doc__}\n' - f' orphaned {get_orphaned_folders.__doc__}\n' - f' corrupted {get_corrupted_folders.__doc__}\n' - f' unrecognized {get_unrecognized_folders.__doc__}\n' + if exit_code != 0: + raise SystemExit(exit_code) + + stats["reindexed"] = len(records) + return stats + + +@enforce_types +def update( + filter_patterns: Iterable[str] = (), + filter_type: str = "exact", + before: float | None = None, + after: float | None = None, + resume: str | None = None, + batch_size: int = 100, + continuous: bool = False, + index_only: bool = False, +) -> None: + """ + Update snapshots: migrate old dirs, reconcile DB, and re-queue for archiving. 
+ + Three-phase operation (without filters): + - Phase 1: Drain old archive/ dirs by moving to new fs location (0.8.x → 0.9.x) + - Phase 2: O(n) scan over entire DB from most recent to least recent + - No orphan scans needed (trust 1:1 mapping between DB and filesystem after phase 1) + + With filters: Only phase 2 (DB query), no filesystem operations. + Without filters: All phases (full update). + """ + + from rich import print + from archivebox.config.django import setup_django + + setup_django() + + from django.core.management import call_command + + # Run migrations first to ensure DB schema is up-to-date + print("[*] Checking for pending migrations...") + try: + call_command("migrate", "--no-input", verbosity=0) + except Exception as e: + print(f"[!] Warning: Migration check failed: {e}") + + while True: + if index_only: + search_plugins = _get_search_indexing_plugins() + if not search_plugins: + print("[*] No search indexing plugins are available, nothing to backfill.") + break + + if not (filter_patterns or before or after): + print("[*] Phase 1: Draining old archive/ directories (0.8.x → 0.9.x migration)...") + drain_old_archive_dirs( + resume_from=resume, + batch_size=batch_size, + ) + + snapshots = _build_filtered_snapshots_queryset( + filter_patterns=filter_patterns, + filter_type=filter_type, + before=before, + after=after, + resume=resume, + ) + stats = reindex_snapshots( + snapshots, + search_plugins=search_plugins, + batch_size=batch_size, + ) + print_index_stats(stats) + elif filter_patterns or before or after: + # Filtered mode: query DB only + print("[*] Processing filtered snapshots from database...") + stats = process_filtered_snapshots( + filter_patterns=filter_patterns, + filter_type=filter_type, + before=before, + after=after, + resume=resume, + batch_size=batch_size, + ) + print_stats(stats) + else: + # Full mode: drain old dirs + process DB + stats_combined = {"phase1": {}, "phase2": {}} + + print("[*] Phase 1: Draining old archive/ 
directories (0.8.x → 0.9.x migration)...") + stats_combined["phase1"] = drain_old_archive_dirs( + resume_from=resume, + batch_size=batch_size, + ) + + print("[*] Phase 2: Processing all database snapshots (most recent first)...") + stats_combined["phase2"] = process_all_db_snapshots(batch_size=batch_size, resume=resume) + + # Phase 3: Deduplication (disabled for now) + # print('[*] Phase 3: Deduplicating...') + # stats_combined['deduplicated'] = Snapshot.find_and_merge_duplicates() + + print_combined_stats(stats_combined) + + if not continuous: + break + + print("[yellow]Sleeping 60s before next pass...[/yellow]") + time.sleep(60) + resume = None + + +def drain_old_archive_dirs(resume_from: str | None = None, batch_size: int = 100) -> dict[str, int]: + """ + Drain old archive/ directories (0.8.x → 0.9.x migration). + + Only processes real directories (skips symlinks - those are already migrated). + For each old dir found in archive/: + 1. Load or create DB snapshot + 2. Trigger fs migration on save() to move to data/users/{user}/... + 3. Leave symlink in archive/ pointing to new location + + After this drains, archive/ should only contain symlinks and we can trust + 1:1 mapping between DB and filesystem. 
+ """ + from archivebox.core.models import Snapshot + from archivebox.config import CONSTANTS + from django.db import transaction + + stats = {"processed": 0, "migrated": 0, "skipped": 0, "invalid": 0} + + archive_dir = CONSTANTS.ARCHIVE_DIR + if not archive_dir.exists(): + return stats + + print("[DEBUG Phase1] Scanning for old directories in archive/...") + + # Scan for real directories only (skip symlinks - they're already migrated) + all_entries = list(os.scandir(archive_dir)) + print(f"[DEBUG Phase1] Total entries in archive/: {len(all_entries)}") + entries = [ + (e.stat().st_mtime, e.path) + for e in all_entries + if e.is_dir(follow_symlinks=False) # Skip symlinks + ] + entries.sort(reverse=True) # Newest first + print(f"[DEBUG Phase1] Real directories (not symlinks): {len(entries)}") + print(f"[*] Found {len(entries)} old directories to drain") + + for mtime, entry_path in entries: + entry_path = Path(entry_path) + + # Resume from timestamp if specified + if resume_from and entry_path.name > resume_from: + continue + + stats["processed"] += 1 + + # Try to load existing snapshot from DB + snapshot = Snapshot.load_from_directory(entry_path) + + if not snapshot: + # Not in DB - create new snapshot record + snapshot = Snapshot.create_from_directory(entry_path) + if not snapshot: + # Invalid directory - move to invalid/ + Snapshot.move_directory_to_invalid(entry_path) + stats["invalid"] += 1 + print(f" [{stats['processed']}] Invalid: {entry_path.name}") + continue + + try: + snapshot.save() + stats["migrated"] += 1 + print(f" [{stats['processed']}] Imported orphaned snapshot: {entry_path.name}") + except Exception as e: + stats["skipped"] += 1 + print(f" [{stats['processed']}] Skipped (error: {e}): {entry_path.name}") + continue + + # Ensure snapshot has a valid crawl (migration 0024 may have failed) + has_valid_crawl = _get_snapshot_crawl(snapshot) is not None + + if not has_valid_crawl: + # Create a new crawl (created_by will default to system user) + from 
archivebox.crawls.models import Crawl + + crawl = Crawl.objects.create(urls=snapshot.url) + # Use queryset update to avoid triggering save() hooks + from archivebox.core.models import Snapshot as SnapshotModel + + SnapshotModel.objects.filter(pk=snapshot.pk).update(crawl=crawl) + # Refresh the instance + snapshot.crawl = crawl + print(f"[DEBUG Phase1] Created missing crawl for snapshot {str(snapshot.id)[:8]}") + + # Check if needs migration (0.8.x → 0.9.x) + print( + f"[DEBUG Phase1] Snapshot {str(snapshot.id)[:8]}: fs_version={snapshot.fs_version}, needs_migration={snapshot.fs_migration_needed}", ) + if snapshot.fs_migration_needed: + try: + # Calculate paths using actual directory (entry_path), not snapshot.timestamp + # because snapshot.timestamp might be truncated + old_dir = entry_path + new_dir = snapshot.get_storage_path_for_version("0.9.0") + print(f"[DEBUG Phase1] Migrating {old_dir.name} → {new_dir}") + + # Manually migrate files + if not new_dir.exists() and old_dir.exists(): + new_dir.mkdir(parents=True, exist_ok=True) + import shutil + + file_count = 0 + for old_file in old_dir.rglob("*"): + if old_file.is_file(): + rel_path = old_file.relative_to(old_dir) + new_file = new_dir / rel_path + if not new_file.exists(): + new_file.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(old_file, new_file) + file_count += 1 + print(f"[DEBUG Phase1] Copied {file_count} files") + + # Update only fs_version field using queryset update (bypasses validation) + from archivebox.core.models import Snapshot as SnapshotModel + + SnapshotModel.objects.filter(pk=snapshot.pk).update(fs_version="0.9.0") + + # Commit the transaction + transaction.commit() + + # Cleanup: delete old dir and create symlink + if old_dir.exists() and old_dir != new_dir: + snapshot._cleanup_old_migration_dir(old_dir, new_dir) + + stats["migrated"] += 1 + print(f" [{stats['processed']}] Migrated: {entry_path.name}") + except Exception as e: + stats["skipped"] += 1 + print(f" 
[{stats['processed']}] Skipped (error: {e}): {entry_path.name}") + else: + stats["skipped"] += 1 + + if stats["processed"] % batch_size == 0: + transaction.commit() + + transaction.commit() + return stats + + +def process_all_db_snapshots(batch_size: int = 100, resume: str | None = None) -> dict[str, int]: + """ + O(n) scan over entire DB from most recent to least recent. + + For each snapshot: + 1. Reconcile index.json with DB (merge titles, tags, archive results) + 2. Queue for archiving (state machine will handle it) + + No orphan detection needed - we trust 1:1 mapping between DB and filesystem + after Phase 1 has drained all old archive/ directories. + """ + from archivebox.core.models import Snapshot + from django.db import transaction + from django.utils import timezone + + stats = {"processed": 0, "reconciled": 0, "queued": 0} + + queryset = Snapshot.objects.all() + if resume: + queryset = queryset.filter(timestamp__lte=resume) + total = queryset.count() + print(f"[*] Processing {total} snapshots from database (most recent first)...") + + # Process from most recent to least recent + for snapshot in queryset.select_related("crawl").order_by("-bookmarked_at").iterator(chunk_size=batch_size): + stats["processed"] += 1 + + # Skip snapshots with missing crawl references (orphaned by migration errors) + if _get_snapshot_crawl(snapshot) is None: + continue + + try: + print( + f"[DEBUG Phase2] Snapshot {str(snapshot.id)[:8]}: fs_version={snapshot.fs_version}, needs_migration={snapshot.fs_migration_needed}", + ) + + # Check if snapshot has a directory on disk + from pathlib import Path + + output_dir = Path(snapshot.output_dir) + has_directory = output_dir.exists() and output_dir.is_dir() + + # Only reconcile if directory exists (don't create empty directories for orphans) + if has_directory: + snapshot.reconcile_with_index_json() + + # Clean up invalid field values from old migrations + if not isinstance(snapshot.current_step, int): + snapshot.current_step = 0 + + 
# If still needs migration, it's an orphan (no directory on disk) + # Mark it as migrated to prevent save() from triggering filesystem migration + if snapshot.fs_migration_needed: + if has_directory: + print(f"[DEBUG Phase2] WARNING: Snapshot {str(snapshot.id)[:8]} has directory but still needs migration") + else: + print(f"[DEBUG Phase2] Orphan snapshot {str(snapshot.id)[:8]} - marking as migrated without filesystem operation") + # Use queryset update to set fs_version without triggering save() hooks + from archivebox.core.models import Snapshot as SnapshotModel + + SnapshotModel.objects.filter(pk=snapshot.pk).update(fs_version="0.9.0") + snapshot.fs_version = "0.9.0" + + # Queue for archiving (state machine will handle it) + snapshot.status = Snapshot.StatusChoices.QUEUED + snapshot.retry_at = timezone.now() + snapshot.save() + + stats["reconciled"] += 1 if has_directory else 0 + stats["queued"] += 1 + except Exception as e: + # Skip snapshots that can't be processed (e.g., missing crawl) + print(f" [!] 
Skipping snapshot {snapshot.id}: {e}") + continue + + if stats["processed"] % batch_size == 0: + transaction.commit() + print(f" [{stats['processed']}/{total}] Processed...") + + transaction.commit() + return stats + + +def process_filtered_snapshots( + filter_patterns: Iterable[str], + filter_type: str, + before: float | None, + after: float | None, + resume: str | None, + batch_size: int, +) -> dict[str, int]: + """Process snapshots matching filters (DB query only).""" + from django.db import transaction + from django.utils import timezone + + stats = {"processed": 0, "reconciled": 0, "queued": 0} + + snapshots = _build_filtered_snapshots_queryset( + filter_patterns=filter_patterns, + filter_type=filter_type, + before=before, + after=after, + resume=resume, ) - parser.add_argument( - '--filter-type', '-t', - type=str, - choices=(*LINK_FILTERS.keys(), 'search'), - default='exact', - help='Type of pattern matching to use when filtering URLs', - ) - parser.add_argument( - 'filter_patterns', - nargs='*', - type=str, - default=None, - help='Update only URLs matching these filter patterns.' - ) - parser.add_argument( - "--extract", - type=str, - help="Pass a list of the extractors to be used. If the method name is not correct, it will be ignored. 
\ - This does not take precedence over the configuration", - default="" - ) - command = parser.parse_args(args or ()) - - filter_patterns_str = None - if not command.filter_patterns: - filter_patterns_str = accept_stdin(stdin) - - update( - resume=command.resume, - only_new=command.only_new, - index_only=command.index_only, - overwrite=command.overwrite, - filter_patterns_str=filter_patterns_str, - filter_patterns=command.filter_patterns, - filter_type=command.filter_type, - status=command.status, - after=command.after, - before=command.before, - out_dir=pwd or OUTPUT_DIR, - extractors=command.extract, - ) - -if __name__ == '__main__': - main(args=sys.argv[1:], stdin=sys.stdin) + total = snapshots.count() + print(f"[*] Found {total} matching snapshots") + + for snapshot in snapshots.select_related("crawl").iterator(chunk_size=batch_size): + stats["processed"] += 1 + + # Skip snapshots with missing crawl references + if _get_snapshot_crawl(snapshot) is None: + continue + + try: + # Reconcile index.json with DB + snapshot.reconcile_with_index_json() + + # Clean up invalid field values from old migrations + if not isinstance(snapshot.current_step, int): + snapshot.current_step = 0 + + # Queue for archiving + snapshot.status = Snapshot.StatusChoices.QUEUED + snapshot.retry_at = timezone.now() + snapshot.save() + + stats["reconciled"] += 1 + stats["queued"] += 1 + except Exception as e: + # Skip snapshots that can't be processed + print(f" [!] 
Skipping snapshot {snapshot.id}: {e}") + continue + + if stats["processed"] % batch_size == 0: + transaction.commit() + print(f" [{stats['processed']}/{total}] Processed...") + + transaction.commit() + return stats + + +def print_stats(stats: dict): + """Print statistics for filtered mode.""" + from rich import print + + print(f""" +[green]Update Complete[/green] + Processed: {stats["processed"]} + Reconciled: {stats["reconciled"]} + Queued: {stats["queued"]} +""") + + +def print_combined_stats(stats_combined: dict): + """Print statistics for full mode.""" + from rich import print + + s1 = stats_combined["phase1"] + s2 = stats_combined["phase2"] + + print(f""" +[green]Archive Update Complete[/green] + +Phase 1 (Drain Old Dirs): + Checked: {s1.get("processed", 0)} + Migrated: {s1.get("migrated", 0)} + Skipped: {s1.get("skipped", 0)} + Invalid: {s1.get("invalid", 0)} + +Phase 2 (Process DB): + Processed: {s2.get("processed", 0)} + Reconciled: {s2.get("reconciled", 0)} + Queued: {s2.get("queued", 0)} +""") + + +def print_index_stats(stats: dict[str, Any]) -> None: + from rich import print + + print(f""" +[green]Search Reindex Complete[/green] + Processed: {stats["processed"]} + Reconciled: {stats["reconciled"]} + Queued: {stats["queued"]} + Reindexed: {stats["reindexed"]} +""") + + +@click.command() +@click.option("--resume", type=str, help="Resume from timestamp") +@click.option("--before", type=float, help="Only snapshots before timestamp") +@click.option("--after", type=float, help="Only snapshots after timestamp") +@click.option("--filter-type", "-t", type=click.Choice(["exact", "substring", "regex", "domain", "tag", "timestamp"]), default="exact") +@click.option("--batch-size", type=int, default=100, help="Commit every N snapshots") +@click.option("--continuous", is_flag=True, help="Run continuously as background worker") +@click.option("--index-only", is_flag=True, help="Backfill available search indexes from existing archived content") 
+@click.argument("filter_patterns", nargs=-1) +@docstring(update.__doc__) +def main(**kwargs): + update(**kwargs) + + +if __name__ == "__main__": + main() diff --git a/archivebox/cli/archivebox_version.py b/archivebox/cli/archivebox_version.py index e7922f37c7..cd088079e9 100755 --- a/archivebox/cli/archivebox_version.py +++ b/archivebox/cli/archivebox_version.py @@ -1,40 +1,337 @@ #!/usr/bin/env python3 -__package__ = 'archivebox.cli' -__command__ = 'archivebox version' +__package__ = "archivebox.cli" import sys -import argparse +import os +import platform +from pathlib import Path +from collections.abc import Iterable -from typing import Optional, List, IO +import rich_click as click -from ..main import version -from ..util import docstring -from ..config import OUTPUT_DIR -from ..logging_util import SmartFormatter, reject_stdin +from archivebox.misc.util import docstring, enforce_types -@docstring(version.__doc__) -def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None: - parser = argparse.ArgumentParser( - prog=__command__, - description=version.__doc__, - add_help=True, - formatter_class=SmartFormatter, +def _format_binary_abspath( + abspath: str, + *, + pwd: Path, + lib_dir: Path, + personas_dir: Path, + home: Path, +) -> str: + path = Path(abspath).expanduser() + try: + normalized = path.resolve(strict=False) + except Exception: + normalized = path + + machine = platform.machine().lower() + system = platform.system().lower() + arch_scope = f"{machine}-{system}" + + candidate_bases: tuple[tuple[Path, str], ...] 
= ( + (pwd, "./"), + (lib_dir, "LIB_DIR/"), + (Path(os.environ.get("LIB_DIR", "")), "LIB_DIR/") if os.environ.get("LIB_DIR") else (Path(), ""), + (personas_dir, "PERSONAS_DIR/"), + (Path(os.environ.get("PERSONAS_DIR", "")), "PERSONAS_DIR/") if os.environ.get("PERSONAS_DIR") else (Path(), ""), + (home / ".config" / "abx" / "lib" / arch_scope, "LIB_DIR/"), + (home / ".config" / "abx" / "lib", "LIB_DIR/"), + (home / ".config" / "abx" / "personas", "PERSONAS_DIR/"), + (home, "~/"), ) - parser.add_argument( - '--quiet', '-q', - action='store_true', - help='Only print ArchiveBox version number and nothing else.', + + for base, prefix in candidate_bases: + if not prefix: + continue + for candidate in (base, base.resolve(strict=False)): + try: + relative = normalized.relative_to(candidate) + except ValueError: + continue + + relative_str = relative.as_posix() + if prefix == "./": + return "." if not relative_str else f"./{relative_str}" + if prefix == "LIB_DIR/": + return "LIB_DIR" if not relative_str else f"LIB_DIR/{relative_str}" + if prefix == "PERSONAS_DIR/": + return "PERSONAS_DIR" if not relative_str else f"PERSONAS_DIR/{relative_str}" + return "~" if not relative_str else f"~/{relative_str}" + + return normalized.as_posix() + + +def _render_binary_abspath(abspath: str): + from rich.text import Text + + if abspath.startswith("LIB_DIR/"): + return Text.assemble(("LIB_DIR", "bright_blue"), (abspath.removeprefix("LIB_DIR"), "green")) + if abspath == "LIB_DIR": + return Text("LIB_DIR", style="bright_blue") + if abspath.startswith("PERSONAS_DIR/"): + return Text.assemble(("PERSONAS_DIR", "medium_purple"), (abspath.removeprefix("PERSONAS_DIR"), "green")) + if abspath == "PERSONAS_DIR": + return Text("PERSONAS_DIR", style="medium_purple") + if abspath.startswith("~/"): + return Text.assemble(("~", "cyan"), (abspath.removeprefix("~"), "green")) + if abspath == "~": + return Text("~", style="cyan") + if abspath.startswith("./"): + return Text.assemble((".", "cyan"), 
(abspath.removeprefix("."), "green")) + if abspath == ".": + return Text(".", style="cyan") + return Text(abspath, style="green") + + +@enforce_types +def version( + quiet: bool = False, + binaries: Iterable[str] = (), +) -> list[str]: + """Print the ArchiveBox version, debug metadata, and installed dependency versions""" + + # fast path for just getting the version and exiting, dont do any slower imports + from archivebox.config.version import VERSION + + print(VERSION) + if quiet or "--version" in sys.argv: + return [] + + from rich.panel import Panel + from rich.console import Console + + from archivebox.config import CONSTANTS + from archivebox.config.version import get_COMMIT_HASH, get_BUILD_TIME + from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, RUNNING_AS_UID, RUNNING_AS_GID, IN_DOCKER + from archivebox.config.paths import get_data_locations, get_code_locations + from archivebox.config.common import SHELL_CONFIG, STORAGE_CONFIG, SEARCH_BACKEND_CONFIG + from archivebox.misc.logging_util import printable_folder_status + from archivebox.config.configset import get_config + + console = Console() + prnt = console.print + + # Check if LDAP is enabled (simple config lookup) + config = get_config() + LDAP_ENABLED = config.get("LDAP_ENABLED", False) + + p = platform.uname() + COMMIT_HASH = get_COMMIT_HASH() + prnt( + f"[dark_green]ArchiveBox[/dark_green] [dark_goldenrod]v{CONSTANTS.VERSION}[/dark_goldenrod]", + f"COMMIT_HASH={COMMIT_HASH[:7] if COMMIT_HASH else 'unknown'}", + f"BUILD_TIME={get_BUILD_TIME()}", ) - command = parser.parse_args(args or ()) - reject_stdin(__command__, stdin) - - version( - quiet=command.quiet, - out_dir=pwd or OUTPUT_DIR, + prnt( + f"IN_DOCKER={IN_DOCKER}", + f"IN_QEMU={SHELL_CONFIG.IN_QEMU}", + f"ARCH={p.machine}", + f"OS={p.system}", + f"PLATFORM={platform.platform()}", + f"PYTHON={sys.implementation.name.title()}" + (" (venv)" if CONSTANTS.IS_INSIDE_VENV else ""), ) + try: + OUTPUT_IS_REMOTE_FS = 
get_data_locations().DATA_DIR.is_mount or get_data_locations().ARCHIVE_DIR.is_mount + except Exception: + OUTPUT_IS_REMOTE_FS = False + + try: + DATA_DIR_STAT = CONSTANTS.DATA_DIR.stat() + prnt( + f"EUID={os.geteuid()}:{os.getegid()} UID={RUNNING_AS_UID}:{RUNNING_AS_GID} PUID={ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}", + f"FS_UID={DATA_DIR_STAT.st_uid}:{DATA_DIR_STAT.st_gid}", + f"FS_PERMS={STORAGE_CONFIG.OUTPUT_PERMISSIONS}", + f"FS_ATOMIC={STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES}", + f"FS_REMOTE={OUTPUT_IS_REMOTE_FS}", + ) + except Exception: + prnt( + f"EUID={os.geteuid()}:{os.getegid()} UID={RUNNING_AS_UID}:{RUNNING_AS_GID} PUID={ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}", + ) + + prnt( + f"DEBUG={SHELL_CONFIG.DEBUG}", + f"IS_TTY={SHELL_CONFIG.IS_TTY}", + f"SUDO={CONSTANTS.IS_ROOT}", + f"ID={CONSTANTS.MACHINE_ID}:{CONSTANTS.COLLECTION_ID}", + f"SEARCH_BACKEND={SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}", + f"LDAP={LDAP_ENABLED}", + ) + prnt() + + if not (os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) and os.access(CONSTANTS.CONFIG_FILE, os.R_OK)): + PANEL_TEXT = "\n".join( + ( + "", + "[violet]Hint:[/violet] [green]cd[/green] into a collection [blue]DATA_DIR[/blue] and run [green]archivebox version[/green] again...", + " [grey53]OR[/grey53] run [green]archivebox init[/green] to create a new collection in the current dir.", + "", + " [i][grey53](this is [red]REQUIRED[/red] if you are opening a Github Issue to get help)[/grey53][/i]", + "", + ), + ) + prnt( + Panel( + PANEL_TEXT, + expand=False, + border_style="grey53", + title="[red]:exclamation: No collection [blue]DATA_DIR[/blue] is currently active[/red]", + subtitle="Full version info is only available when inside a collection [light_slate_blue]DATA DIR[/light_slate_blue]", + ), + ) + prnt() + return [] + + prnt("[pale_green1][i] Binary Dependencies:[/pale_green1]") + failures = [] + + # Setup Django before importing models + try: + from archivebox.config.django import setup_django + + setup_django() + + from 
archivebox.machine.models import Machine, Binary + + machine = Machine.current() + + if isinstance(binaries, str): + requested_names = {name.strip() for name in binaries.split(",") if name.strip()} + else: + requested_names = {name for name in (binaries or ()) if name} + + db_binaries: dict[str, Binary] = {} + for binary in Binary.objects.filter(machine=machine).order_by("name", "-modified_at"): + db_binaries.setdefault(binary.name, binary) + + all_binary_names = sorted(requested_names or set(db_binaries.keys())) + + if not all_binary_names: + prnt("", "[grey53]No binaries detected. Run [green]archivebox install[/green] to detect dependencies.[/grey53]") + else: + any_available = False + compact_paths = console.is_terminal + for name in all_binary_names: + if requested_names and name not in requested_names: + continue + + installed = db_binaries.get(name) + if installed and installed.is_valid: + display_name = Path(name).expanduser().name if ("/" in name or name.startswith("~")) else name + display_path = ( + _format_binary_abspath( + installed.abspath, + pwd=Path.cwd(), + lib_dir=STORAGE_CONFIG.LIB_DIR, + personas_dir=Path.home() / ".config" / "abx" / "personas", + home=Path.home(), + ) + if compact_paths + else installed.abspath + ) + rendered_path = _render_binary_abspath(display_path) if compact_paths else display_path + version_str = (installed.version or "unknown")[:15] + provider = (installed.binprovider or "env")[:8] + prnt( + "", + "[green]√[/green]", + "", + display_name.ljust(18), + version_str.ljust(16), + provider.ljust(8), + rendered_path, + overflow="ignore", + crop=False, + ) + any_available = True + continue + + status = ( + "[grey53]not recorded[/grey53]" if name in requested_names and installed is None else "[grey53]not installed[/grey53]" + ) + prnt("", "[red]X[/red]", "", name.ljust(18), status, overflow="ignore", crop=False) + failures.append(name) + + if not any_available: + prnt("", "[grey53]No binaries detected. 
Run [green]archivebox install[/green] to detect dependencies.[/grey53]") + + # Show hint if no binaries are installed yet + has_any_installed = Binary.objects.filter(machine=machine).exclude(abspath="").exists() + if not has_any_installed: + prnt() + prnt("", "[grey53]Run [green]archivebox install[/green] to detect and install dependencies.[/grey53]") + + except Exception as e: + # Handle database errors gracefully (locked, missing, etc.) + prnt() + prnt("", f"[yellow]Warning: Could not query binaries from database: {e}[/yellow]") + prnt("", "[grey53]Run [green]archivebox init[/green] and [green]archivebox install[/green] to set up dependencies.[/grey53]") + + if not binaries: + # Show code and data locations + prnt() + prnt("[deep_sky_blue3][i] Code locations:[/deep_sky_blue3]") + try: + for name, path in get_code_locations().items(): + if isinstance(name, str) and isinstance(path, dict): + prnt(printable_folder_status(name, path), overflow="ignore", crop=False) + except Exception as e: + prnt(f" [red]Error getting code locations: {e}[/red]") + + prnt() + if os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) or os.access(CONSTANTS.CONFIG_FILE, os.R_OK): + prnt("[bright_yellow][i] Data locations:[/bright_yellow]") + try: + for name, path in get_data_locations().items(): + if isinstance(name, str) and isinstance(path, dict): + prnt(printable_folder_status(name, path), overflow="ignore", crop=False) + except Exception as e: + prnt(f" [red]Error getting data locations: {e}[/red]") + + try: + from archivebox.misc.checks import check_data_dir_permissions + + check_data_dir_permissions() + except Exception: + pass + else: + prnt() + prnt("[red][i] Data locations:[/red] (not in a data directory)") + + prnt() + + if failures: + prnt("[red]Error:[/red] [yellow]Failed to detect the following binaries:[/yellow]") + prnt(f" [red]{', '.join(failures)}[/red]") + prnt() + prnt("[violet]Hint:[/violet] To install missing binaries automatically, run:") + prnt(" [green]archivebox 
install[/green]") + prnt() + return failures + + +@click.command() +@click.option( + "--quiet", + "-q", + is_flag=True, + help="Only print ArchiveBox version number and nothing else. (equivalent to archivebox --version)", +) +@click.option( + "--binaries", + "-b", + help="Select binaries to detect DEFAULT=curl,wget,git,yt-dlp,chrome,single-file,readability-extractor,postlight-parser,... (all)", +) +@docstring(version.__doc__) +def main(**kwargs): + failures = version(**kwargs) + if failures: + raise SystemExit(1) + -if __name__ == '__main__': - main(args=sys.argv[1:], stdin=sys.stdin) +if __name__ == "__main__": + main() diff --git a/archivebox/cli/cli_utils.py b/archivebox/cli/cli_utils.py new file mode 100644 index 0000000000..799624e2ea --- /dev/null +++ b/archivebox/cli/cli_utils.py @@ -0,0 +1,44 @@ +""" +Shared CLI utilities for ArchiveBox commands. + +This module contains common utilities used across multiple CLI commands, +extracted to avoid code duplication. +""" + +__package__ = "archivebox.cli" + + +def apply_filters(queryset, filter_kwargs: dict, limit: int | None = None): + """ + Apply Django-style filters from CLI kwargs to a QuerySet. 
+ + Supports: --status=queued, --url__icontains=example, --id__in=uuid1,uuid2 + + Args: + queryset: Django QuerySet to filter + filter_kwargs: Dict of filter key-value pairs from CLI + limit: Optional limit on results + + Returns: + Filtered QuerySet + + Example: + queryset = Snapshot.objects.all() + filter_kwargs = {'status': 'queued', 'url__icontains': 'example.com'} + filtered = apply_filters(queryset, filter_kwargs, limit=10) + """ + filters = {} + for key, value in filter_kwargs.items(): + if value is None or key in ("limit", "offset"): + continue + # Handle CSV lists for __in filters + if key.endswith("__in") and isinstance(value, str): + value = [v.strip() for v in value.split(",")] + filters[key] = value + + if filters: + queryset = queryset.filter(**filters) + if limit: + queryset = queryset[:limit] + + return queryset diff --git a/archivebox/cli/tests.py b/archivebox/cli/tests.py deleted file mode 100644 index 04c54df8ad..0000000000 --- a/archivebox/cli/tests.py +++ /dev/null @@ -1,227 +0,0 @@ -#!/usr/bin/env python3 - -__package__ = 'archivebox.cli' - - -import os -import sys -import shutil -import unittest -from pathlib import Path - -from contextlib import contextmanager - -TEST_CONFIG = { - 'USE_COLOR': 'False', - 'SHOW_PROGRESS': 'False', - - 'OUTPUT_DIR': 'data.tests', - - 'SAVE_ARCHIVE_DOT_ORG': 'False', - 'SAVE_TITLE': 'False', - - 'USE_CURL': 'False', - 'USE_WGET': 'False', - 'USE_GIT': 'False', - 'USE_CHROME': 'False', - 'USE_YOUTUBEDL': 'False', -} - -OUTPUT_DIR = 'data.tests' -os.environ.update(TEST_CONFIG) - -from ..main import init -from ..index import load_main_index -from ..config import ( - SQL_INDEX_FILENAME, - JSON_INDEX_FILENAME, - HTML_INDEX_FILENAME, -) - -from . 
import ( - archivebox_init, - archivebox_add, - archivebox_remove, -) - -HIDE_CLI_OUTPUT = True - -test_urls = ''' -https://example1.com/what/is/happening.html?what=1#how-about-this=1 -https://example2.com/what/is/happening/?what=1#how-about-this=1 -HTtpS://example3.com/what/is/happening/?what=1#how-about-this=1f -https://example4.com/what/is/happening.html -https://example5.com/ -https://example6.com - -http://example7.com -[https://example8.com/what/is/this.php?what=1] -[and http://example9.com?what=1&other=3#and-thing=2] -https://example10.com#and-thing=2 " -abcdef -sdflkf[what](https://subb.example12.com/who/what.php?whoami=1#whatami=2)?am=hi -example13.bada -and example14.badb -htt://example15.badc -''' - -stdout = sys.stdout -stderr = sys.stderr - - -@contextmanager -def output_hidden(show_failing=True): - if not HIDE_CLI_OUTPUT: - yield - return - - sys.stdout = open('stdout.txt', 'w+', encoding='utf-8') - sys.stderr = open('stderr.txt', 'w+', encoding='utf-8') - try: - yield - sys.stdout.close() - sys.stderr.close() - sys.stdout = stdout - sys.stderr = stderr - except Exception: - sys.stdout.close() - sys.stderr.close() - sys.stdout = stdout - sys.stderr = stderr - if show_failing: - with open('stdout.txt', 'r', encoding='utf-8') as f: - print(f.read()) - with open('stderr.txt', 'r', encoding='utf-8') as f: - print(f.read()) - raise - finally: - os.remove('stdout.txt') - os.remove('stderr.txt') - - -class TestInit(unittest.TestCase): - def setUp(self): - os.makedirs(OUTPUT_DIR, exist_ok=True) - - def tearDown(self): - shutil.rmtree(OUTPUT_DIR, ignore_errors=True) - - def test_basic_init(self): - with output_hidden(): - archivebox_init.main([]) - - assert (Path(OUTPUT_DIR) / SQL_INDEX_FILENAME).exists() - assert (Path(OUTPUT_DIR) / JSON_INDEX_FILENAME).exists() - assert (Path(OUTPUT_DIR) / HTML_INDEX_FILENAME).exists() - assert len(load_main_index(out_dir=OUTPUT_DIR)) == 0 - - def test_conflicting_init(self): - with open(Path(OUTPUT_DIR) / 
'test_conflict.txt', 'w+', encoding='utf-8') as f: - f.write('test') - - try: - with output_hidden(show_failing=False): - archivebox_init.main([]) - assert False, 'Init should have exited with an exception' - except SystemExit: - pass - - assert not (Path(OUTPUT_DIR) / SQL_INDEX_FILENAME).exists() - assert not (Path(OUTPUT_DIR) / JSON_INDEX_FILENAME).exists() - assert not (Path(OUTPUT_DIR) / HTML_INDEX_FILENAME).exists() - try: - load_main_index(out_dir=OUTPUT_DIR) - assert False, 'load_main_index should raise an exception when no index is present' - except Exception: - pass - - def test_no_dirty_state(self): - with output_hidden(): - init() - shutil.rmtree(OUTPUT_DIR, ignore_errors=True) - with output_hidden(): - init() - - -class TestAdd(unittest.TestCase): - def setUp(self): - os.makedirs(OUTPUT_DIR, exist_ok=True) - with output_hidden(): - init() - - def tearDown(self): - shutil.rmtree(OUTPUT_DIR, ignore_errors=True) - - def test_add_arg_url(self): - with output_hidden(): - archivebox_add.main(['https://getpocket.com/users/nikisweeting/feed/all']) - - all_links = load_main_index(out_dir=OUTPUT_DIR) - assert len(all_links) == 30 - - def test_add_arg_file(self): - test_file = Path(OUTPUT_DIR) / 'test.txt' - with open(test_file, 'w+', encoding='utf') as f: - f.write(test_urls) - - with output_hidden(): - archivebox_add.main([test_file]) - - all_links = load_main_index(out_dir=OUTPUT_DIR) - assert len(all_links) == 12 - os.remove(test_file) - - def test_add_stdin_url(self): - with output_hidden(): - archivebox_add.main([], stdin=test_urls) - - all_links = load_main_index(out_dir=OUTPUT_DIR) - assert len(all_links) == 12 - - -class TestRemove(unittest.TestCase): - def setUp(self): - os.makedirs(OUTPUT_DIR, exist_ok=True) - with output_hidden(): - init() - archivebox_add.main([], stdin=test_urls) - - # def tearDown(self): - # shutil.rmtree(OUTPUT_DIR, ignore_errors=True) - - - def test_remove_exact(self): - with output_hidden(): - archivebox_remove.main(['--yes', 
'--delete', 'https://example5.com/']) - - all_links = load_main_index(out_dir=OUTPUT_DIR) - assert len(all_links) == 11 - - def test_remove_regex(self): - with output_hidden(): - archivebox_remove.main(['--yes', '--delete', '--filter-type=regex', r'http(s)?:\/\/(.+\.)?(example\d\.com)']) - - all_links = load_main_index(out_dir=OUTPUT_DIR) - assert len(all_links) == 4 - - def test_remove_domain(self): - with output_hidden(): - archivebox_remove.main(['--yes', '--delete', '--filter-type=domain', 'example5.com', 'example6.com']) - - all_links = load_main_index(out_dir=OUTPUT_DIR) - assert len(all_links) == 10 - - def test_remove_none(self): - try: - with output_hidden(show_failing=False): - archivebox_remove.main(['--yes', '--delete', 'https://doesntexist.com']) - assert False, 'Should raise if no URLs match' - except Exception: - pass - - -if __name__ == '__main__': - if '--verbose' in sys.argv or '-v' in sys.argv: - HIDE_CLI_OUTPUT = False - - unittest.main() diff --git a/archivebox/config.py b/archivebox/config.py deleted file mode 100644 index a84f70b9cb..0000000000 --- a/archivebox/config.py +++ /dev/null @@ -1,1176 +0,0 @@ -""" -ArchiveBox config definitons (including defaults and dynamic config options). - -Config Usage Example: - - archivebox config --set MEDIA_TIMEOUT=600 - env MEDIA_TIMEOUT=600 USE_COLOR=False ... archivebox [subcommand] ... - -Config Precedence Order: - - 1. cli args (--update-all / --index-only / etc.) - 2. shell environment vars (env USE_COLOR=False archivebox add '...') - 3. config file (echo "SAVE_FAVICON=False" >> ArchiveBox.conf) - 4. 
defaults (defined below in Python) - -Documentation: - - https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration - -""" - -__package__ = 'archivebox' - -import os -import io -import re -import sys -import json -import getpass -import platform -import shutil -import sqlite3 -import django - -from hashlib import md5 -from pathlib import Path -from datetime import datetime, timezone -from typing import Optional, Type, Tuple, Dict, Union, List -from subprocess import run, PIPE, DEVNULL -from configparser import ConfigParser -from collections import defaultdict - -from .config_stubs import ( - SimpleConfigValueDict, - ConfigValue, - ConfigDict, - ConfigDefaultValue, - ConfigDefaultDict, -) - -############################### Config Schema ################################## - -CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { - 'SHELL_CONFIG': { - 'IS_TTY': {'type': bool, 'default': lambda _: sys.stdout.isatty()}, - 'USE_COLOR': {'type': bool, 'default': lambda c: c['IS_TTY']}, - 'SHOW_PROGRESS': {'type': bool, 'default': lambda c: (c['IS_TTY'] and platform.system() != 'Darwin')}, # progress bars are buggy on mac, disable for now - 'IN_DOCKER': {'type': bool, 'default': False}, - # TODO: 'SHOW_HINTS': {'type: bool, 'default': True}, - }, - - 'GENERAL_CONFIG': { - 'OUTPUT_DIR': {'type': str, 'default': None}, - 'CONFIG_FILE': {'type': str, 'default': None}, - 'ONLY_NEW': {'type': bool, 'default': True}, - 'TIMEOUT': {'type': int, 'default': 60}, - 'MEDIA_TIMEOUT': {'type': int, 'default': 3600}, - 'OUTPUT_PERMISSIONS': {'type': str, 'default': '755'}, - 'RESTRICT_FILE_NAMES': {'type': str, 'default': 'windows'}, - 'URL_BLACKLIST': {'type': str, 'default': r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$'}, # to avoid downloading code assets as their own pages - }, - - 'SERVER_CONFIG': { - 'SECRET_KEY': {'type': str, 'default': None}, - 'BIND_ADDR': {'type': str, 'default': lambda c: ['127.0.0.1:8000', '0.0.0.0:8000'][c['IN_DOCKER']]}, - 
'ALLOWED_HOSTS': {'type': str, 'default': '*'}, - 'DEBUG': {'type': bool, 'default': False}, - 'PUBLIC_INDEX': {'type': bool, 'default': True}, - 'PUBLIC_SNAPSHOTS': {'type': bool, 'default': True}, - 'PUBLIC_ADD_VIEW': {'type': bool, 'default': False}, - 'FOOTER_INFO': {'type': str, 'default': 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.'}, - 'SNAPSHOTS_PER_PAGE': {'type': int, 'default': 40}, - 'CUSTOM_TEMPLATES_DIR': {'type': str, 'default': None}, - 'TIME_ZONE': {'type': str, 'default': 'UTC'}, - }, - - 'ARCHIVE_METHOD_TOGGLES': { - 'SAVE_TITLE': {'type': bool, 'default': True, 'aliases': ('FETCH_TITLE',)}, - 'SAVE_FAVICON': {'type': bool, 'default': True, 'aliases': ('FETCH_FAVICON',)}, - 'SAVE_WGET': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET',)}, - 'SAVE_WGET_REQUISITES': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET_REQUISITES',)}, - 'SAVE_SINGLEFILE': {'type': bool, 'default': True, 'aliases': ('FETCH_SINGLEFILE',)}, - 'SAVE_READABILITY': {'type': bool, 'default': True, 'aliases': ('FETCH_READABILITY',)}, - 'SAVE_MERCURY': {'type': bool, 'default': True, 'aliases': ('FETCH_MERCURY',)}, - 'SAVE_PDF': {'type': bool, 'default': True, 'aliases': ('FETCH_PDF',)}, - 'SAVE_SCREENSHOT': {'type': bool, 'default': True, 'aliases': ('FETCH_SCREENSHOT',)}, - 'SAVE_DOM': {'type': bool, 'default': True, 'aliases': ('FETCH_DOM',)}, - 'SAVE_HEADERS': {'type': bool, 'default': True, 'aliases': ('FETCH_HEADERS',)}, - 'SAVE_WARC': {'type': bool, 'default': True, 'aliases': ('FETCH_WARC',)}, - 'SAVE_GIT': {'type': bool, 'default': True, 'aliases': ('FETCH_GIT',)}, - 'SAVE_MEDIA': {'type': bool, 'default': True, 'aliases': ('FETCH_MEDIA',)}, - 'SAVE_ARCHIVE_DOT_ORG': {'type': bool, 'default': True, 'aliases': ('SUBMIT_ARCHIVE_DOT_ORG',)}, - }, - - 'ARCHIVE_METHOD_OPTIONS': { - 'RESOLUTION': {'type': str, 'default': '1440,2000', 'aliases': ('SCREENSHOT_RESOLUTION',)}, - 'GIT_DOMAINS': {'type': 
str, 'default': 'github.com,bitbucket.org,gitlab.com,gist.github.com'}, - 'CHECK_SSL_VALIDITY': {'type': bool, 'default': True}, - 'MEDIA_MAX_SIZE': {'type': str, 'default': '750m'}, - - 'CURL_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.61 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'}, - 'WGET_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.61 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'}, - 'CHROME_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.61 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'}, - - 'COOKIES_FILE': {'type': str, 'default': None}, - 'CHROME_USER_DATA_DIR': {'type': str, 'default': None}, - - 'CHROME_HEADLESS': {'type': bool, 'default': True}, - 'CHROME_SANDBOX': {'type': bool, 'default': lambda c: not c['IN_DOCKER']}, - 'YOUTUBEDL_ARGS': {'type': list, 'default': lambda c: [ - '--write-description', - '--write-info-json', - '--write-annotations', - '--write-thumbnail', - '--no-call-home', - '--write-sub', - '--all-subs', - '--write-auto-sub', - '--convert-subs=srt', - '--yes-playlist', - '--continue', - '--ignore-errors', - '--geo-bypass', - '--add-metadata', - '--max-filesize={}'.format(c['MEDIA_MAX_SIZE']), - ]}, - - - 'WGET_ARGS': {'type': list, 'default': ['--no-verbose', - '--adjust-extension', - '--convert-links', - '--force-directories', - '--backup-converted', - '--span-hosts', - '--no-parent', - '-e', 'robots=off', - ]}, - 'CURL_ARGS': {'type': list, 'default': ['--silent', - '--location', - '--compressed' - ]}, - 'GIT_ARGS': {'type': list, 'default': ['--recursive']}, - }, - - 'SEARCH_BACKEND_CONFIG' : { - 
'USE_INDEXING_BACKEND': {'type': bool, 'default': True}, - 'USE_SEARCHING_BACKEND': {'type': bool, 'default': True}, - 'SEARCH_BACKEND_ENGINE': {'type': str, 'default': 'ripgrep'}, - 'SEARCH_BACKEND_HOST_NAME': {'type': str, 'default': 'localhost'}, - 'SEARCH_BACKEND_PORT': {'type': int, 'default': 1491}, - 'SEARCH_BACKEND_PASSWORD': {'type': str, 'default': 'SecretPassword'}, - # SONIC - 'SONIC_COLLECTION': {'type': str, 'default': 'archivebox'}, - 'SONIC_BUCKET': {'type': str, 'default': 'snapshots'}, - 'SEARCH_BACKEND_TIMEOUT': {'type': int, 'default': 90}, - }, - - 'DEPENDENCY_CONFIG': { - 'USE_CURL': {'type': bool, 'default': True}, - 'USE_WGET': {'type': bool, 'default': True}, - 'USE_SINGLEFILE': {'type': bool, 'default': True}, - 'USE_READABILITY': {'type': bool, 'default': True}, - 'USE_MERCURY': {'type': bool, 'default': True}, - 'USE_GIT': {'type': bool, 'default': True}, - 'USE_CHROME': {'type': bool, 'default': True}, - 'USE_NODE': {'type': bool, 'default': True}, - 'USE_YOUTUBEDL': {'type': bool, 'default': True}, - 'USE_RIPGREP': {'type': bool, 'default': True}, - - 'CURL_BINARY': {'type': str, 'default': 'curl'}, - 'GIT_BINARY': {'type': str, 'default': 'git'}, - 'WGET_BINARY': {'type': str, 'default': 'wget'}, - 'SINGLEFILE_BINARY': {'type': str, 'default': lambda c: bin_path('single-file')}, - 'READABILITY_BINARY': {'type': str, 'default': lambda c: bin_path('readability-extractor')}, - 'MERCURY_BINARY': {'type': str, 'default': lambda c: bin_path('mercury-parser')}, - 'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'}, - 'NODE_BINARY': {'type': str, 'default': 'node'}, - 'RIPGREP_BINARY': {'type': str, 'default': 'rg'}, - 'CHROME_BINARY': {'type': str, 'default': None}, - - 'POCKET_CONSUMER_KEY': {'type': str, 'default': None}, - 'POCKET_ACCESS_TOKENS': {'type': dict, 'default': {}}, - }, -} - - -########################## Backwards-Compatibility ############################# - - -# for backwards compatibility with old config files, check 
old/deprecated names for each key -CONFIG_ALIASES = { - alias: key - for section in CONFIG_SCHEMA.values() - for key, default in section.items() - for alias in default.get('aliases', ()) -} -USER_CONFIG = {key for section in CONFIG_SCHEMA.values() for key in section.keys()} - -def get_real_name(key: str) -> str: - """get the current canonical name for a given deprecated config key""" - return CONFIG_ALIASES.get(key.upper().strip(), key.upper().strip()) - - - -################################ Constants ##################################### - -PACKAGE_DIR_NAME = 'archivebox' -TEMPLATES_DIR_NAME = 'templates' - -ARCHIVE_DIR_NAME = 'archive' -SOURCES_DIR_NAME = 'sources' -LOGS_DIR_NAME = 'logs' -SQL_INDEX_FILENAME = 'index.sqlite3' -JSON_INDEX_FILENAME = 'index.json' -HTML_INDEX_FILENAME = 'index.html' -ROBOTS_TXT_FILENAME = 'robots.txt' -FAVICON_FILENAME = 'favicon.ico' -CONFIG_FILENAME = 'ArchiveBox.conf' - -DEFAULT_CLI_COLORS = { - 'reset': '\033[00;00m', - 'lightblue': '\033[01;30m', - 'lightyellow': '\033[01;33m', - 'lightred': '\033[01;35m', - 'red': '\033[01;31m', - 'green': '\033[01;32m', - 'blue': '\033[01;34m', - 'white': '\033[01;37m', - 'black': '\033[01;30m', -} -ANSI = {k: '' for k in DEFAULT_CLI_COLORS.keys()} - -COLOR_DICT = defaultdict(lambda: [(0, 0, 0), (0, 0, 0)], { - '00': [(0, 0, 0), (0, 0, 0)], - '30': [(0, 0, 0), (0, 0, 0)], - '31': [(255, 0, 0), (128, 0, 0)], - '32': [(0, 200, 0), (0, 128, 0)], - '33': [(255, 255, 0), (128, 128, 0)], - '34': [(0, 0, 255), (0, 0, 128)], - '35': [(255, 0, 255), (128, 0, 128)], - '36': [(0, 255, 255), (0, 128, 128)], - '37': [(255, 255, 255), (255, 255, 255)], -}) - -STATICFILE_EXTENSIONS = { - # 99.999% of the time, URLs ending in these extensions are static files - # that can be downloaded as-is, not html pages that need to be rendered - 'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp', - 'svg', 'svgz', 'webp', 'ps', 'eps', 'ai', - 'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 
'm4v', - 'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8', - 'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx', - 'atom', 'rss', 'css', 'js', 'json', - 'dmg', 'iso', 'img', - 'rar', 'war', 'hqx', 'zip', 'gz', 'bz2', '7z', - - # Less common extensions to consider adding later - # jar, swf, bin, com, exe, dll, deb - # ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm, - # pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf, - # ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml - - # These are always treated as pages, not as static files, never add them: - # html, htm, shtml, xhtml, xml, aspx, php, cgi -} - -# When initializing archivebox in a new directory, we check to make sure the dir is -# actually empty so that we dont clobber someone's home directory or desktop by accident. -# These files are exceptions to the is_empty check when we're trying to init a new dir, -# as they could be from a previous archivebox version, system artifacts, dependencies, etc. 
-ALLOWED_IN_OUTPUT_DIR = { - '.gitignore', - 'lost+found', - '.DS_Store', - '.venv', - 'venv', - 'virtualenv', - '.virtualenv', - 'node_modules', - 'package.json', - 'package-lock.json', - 'yarn.lock', - 'static', - 'sonic', - ARCHIVE_DIR_NAME, - SOURCES_DIR_NAME, - LOGS_DIR_NAME, - SQL_INDEX_FILENAME, - f'{SQL_INDEX_FILENAME}-wal', - f'{SQL_INDEX_FILENAME}-shm', - JSON_INDEX_FILENAME, - HTML_INDEX_FILENAME, - ROBOTS_TXT_FILENAME, - FAVICON_FILENAME, - CONFIG_FILENAME, - f'{CONFIG_FILENAME}.bak', - 'static_index.json', -} - -############################## Derived Config ################################## - - -DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = { - 'TERM_WIDTH': {'default': lambda c: lambda: shutil.get_terminal_size((100, 10)).columns}, - 'USER': {'default': lambda c: getpass.getuser() or os.getlogin()}, - 'ANSI': {'default': lambda c: DEFAULT_CLI_COLORS if c['USE_COLOR'] else {k: '' for k in DEFAULT_CLI_COLORS.keys()}}, - - 'PACKAGE_DIR': {'default': lambda c: Path(__file__).resolve().parent}, - 'TEMPLATES_DIR': {'default': lambda c: c['PACKAGE_DIR'] / TEMPLATES_DIR_NAME}, - 'CUSTOM_TEMPLATES_DIR': {'default': lambda c: c['CUSTOM_TEMPLATES_DIR'] and Path(c['CUSTOM_TEMPLATES_DIR'])}, - - 'OUTPUT_DIR': {'default': lambda c: Path(c['OUTPUT_DIR']).resolve() if c['OUTPUT_DIR'] else Path(os.curdir).resolve()}, - 'ARCHIVE_DIR': {'default': lambda c: c['OUTPUT_DIR'] / ARCHIVE_DIR_NAME}, - 'SOURCES_DIR': {'default': lambda c: c['OUTPUT_DIR'] / SOURCES_DIR_NAME}, - 'LOGS_DIR': {'default': lambda c: c['OUTPUT_DIR'] / LOGS_DIR_NAME}, - 'CONFIG_FILE': {'default': lambda c: Path(c['CONFIG_FILE']).resolve() if c['CONFIG_FILE'] else c['OUTPUT_DIR'] / CONFIG_FILENAME}, - 'COOKIES_FILE': {'default': lambda c: c['COOKIES_FILE'] and Path(c['COOKIES_FILE']).resolve()}, - 'CHROME_USER_DATA_DIR': {'default': lambda c: find_chrome_data_dir() if c['CHROME_USER_DATA_DIR'] is None else (Path(c['CHROME_USER_DATA_DIR']).resolve() if c['CHROME_USER_DATA_DIR'] else None)}, # None means 
unset, so we autodetect it with find_chrome_Data_dir(), but emptystring '' means user manually set it to '', and we should store it as None - 'URL_BLACKLIST_PTN': {'default': lambda c: c['URL_BLACKLIST'] and re.compile(c['URL_BLACKLIST'] or '', re.IGNORECASE | re.UNICODE | re.MULTILINE)}, - - 'ARCHIVEBOX_BINARY': {'default': lambda c: sys.argv[0] or bin_path('archivebox')}, - 'VERSION': {'default': lambda c: json.loads((Path(c['PACKAGE_DIR']) / 'package.json').read_text(encoding='utf-8').strip())['version']}, - - 'PYTHON_BINARY': {'default': lambda c: sys.executable}, - 'PYTHON_ENCODING': {'default': lambda c: sys.stdout.encoding.upper()}, - 'PYTHON_VERSION': {'default': lambda c: '{}.{}.{}'.format(*sys.version_info[:3])}, - - 'DJANGO_BINARY': {'default': lambda c: django.__file__.replace('__init__.py', 'bin/django-admin.py')}, - 'DJANGO_VERSION': {'default': lambda c: '{}.{}.{} {} ({})'.format(*django.VERSION)}, - - 'USE_CURL': {'default': lambda c: c['USE_CURL'] and (c['SAVE_FAVICON'] or c['SAVE_TITLE'] or c['SAVE_ARCHIVE_DOT_ORG'])}, - 'CURL_VERSION': {'default': lambda c: bin_version(c['CURL_BINARY']) if c['USE_CURL'] else None}, - 'CURL_USER_AGENT': {'default': lambda c: c['CURL_USER_AGENT'].format(**c)}, - 'CURL_ARGS': {'default': lambda c: c['CURL_ARGS'] or []}, - 'SAVE_FAVICON': {'default': lambda c: c['USE_CURL'] and c['SAVE_FAVICON']}, - 'SAVE_ARCHIVE_DOT_ORG': {'default': lambda c: c['USE_CURL'] and c['SAVE_ARCHIVE_DOT_ORG']}, - - 'USE_WGET': {'default': lambda c: c['USE_WGET'] and (c['SAVE_WGET'] or c['SAVE_WARC'])}, - 'WGET_VERSION': {'default': lambda c: bin_version(c['WGET_BINARY']) if c['USE_WGET'] else None}, - 'WGET_AUTO_COMPRESSION': {'default': lambda c: wget_supports_compression(c) if c['USE_WGET'] else False}, - 'WGET_USER_AGENT': {'default': lambda c: c['WGET_USER_AGENT'].format(**c)}, - 'SAVE_WGET': {'default': lambda c: c['USE_WGET'] and c['SAVE_WGET']}, - 'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']}, - 'WGET_ARGS': 
{'default': lambda c: c['WGET_ARGS'] or []}, - - 'RIPGREP_VERSION': {'default': lambda c: bin_version(c['RIPGREP_BINARY']) if c['USE_RIPGREP'] else None}, - - 'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']}, - 'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None}, - - 'USE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['SAVE_READABILITY']}, - 'READABILITY_VERSION': {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None}, - - 'USE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']}, - 'MERCURY_VERSION': {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None}, # mercury is unversioned - - 'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']}, - 'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None}, - 'SAVE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']}, - - 'USE_YOUTUBEDL': {'default': lambda c: c['USE_YOUTUBEDL'] and c['SAVE_MEDIA']}, - 'YOUTUBEDL_VERSION': {'default': lambda c: bin_version(c['YOUTUBEDL_BINARY']) if c['USE_YOUTUBEDL'] else None}, - 'SAVE_MEDIA': {'default': lambda c: c['USE_YOUTUBEDL'] and c['SAVE_MEDIA']}, - 'YOUTUBEDL_ARGS': {'default': lambda c: c['YOUTUBEDL_ARGS'] or []}, - - 'CHROME_BINARY': {'default': lambda c: c['CHROME_BINARY'] or find_chrome_binary()}, - 'USE_CHROME': {'default': lambda c: c['USE_CHROME'] and c['CHROME_BINARY'] and (c['SAVE_PDF'] or c['SAVE_SCREENSHOT'] or c['SAVE_DOM'] or c['SAVE_SINGLEFILE'])}, - 'CHROME_VERSION': {'default': lambda c: bin_version(c['CHROME_BINARY']) if c['USE_CHROME'] else None}, - - 'SAVE_PDF': {'default': lambda c: c['USE_CHROME'] and c['SAVE_PDF']}, - 'SAVE_SCREENSHOT': {'default': lambda c: c['USE_CHROME'] and c['SAVE_SCREENSHOT']}, - 'SAVE_DOM': {'default': lambda c: c['USE_CHROME'] and c['SAVE_DOM']}, - 'SAVE_SINGLEFILE': 
{'default': lambda c: c['USE_CHROME'] and c['SAVE_SINGLEFILE'] and c['USE_NODE']}, - 'SAVE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['USE_NODE']}, - 'SAVE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['USE_NODE']}, - - 'USE_NODE': {'default': lambda c: c['USE_NODE'] and (c['SAVE_READABILITY'] or c['SAVE_SINGLEFILE'] or c['SAVE_MERCURY'])}, - 'NODE_VERSION': {'default': lambda c: bin_version(c['NODE_BINARY']) if c['USE_NODE'] else None}, - - 'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)}, - 'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)}, - 'EXTERNAL_LOCATIONS': {'default': lambda c: get_external_locations(c)}, - 'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)}, - 'CHROME_OPTIONS': {'default': lambda c: get_chrome_info(c)}, -} - - - -################################### Helpers #################################### - - -def load_config_val(key: str, - default: ConfigDefaultValue=None, - type: Optional[Type]=None, - aliases: Optional[Tuple[str, ...]]=None, - config: Optional[ConfigDict]=None, - env_vars: Optional[os._Environ]=None, - config_file_vars: Optional[Dict[str, str]]=None) -> ConfigValue: - """parse bool, int, and str key=value pairs from env""" - - - config_keys_to_check = (key, *(aliases or ())) - for key in config_keys_to_check: - if env_vars: - val = env_vars.get(key) - if val: - break - if config_file_vars: - val = config_file_vars.get(key) - if val: - break - - if type is None or val is None: - if callable(default): - assert isinstance(config, dict) - return default(config) - - return default - - elif type is bool: - if val.lower() in ('true', 'yes', '1'): - return True - elif val.lower() in ('false', 'no', '0'): - return False - else: - raise ValueError(f'Invalid configuration option {key}={val} (expected a boolean: True/False)') - - elif type is str: - if val.lower() in ('true', 'false', 'yes', 'no', '1', '0'): - raise ValueError(f'Invalid configuration option {key}={val} 
(expected a string)') - return val.strip() - - elif type is int: - if not val.isdigit(): - raise ValueError(f'Invalid configuration option {key}={val} (expected an integer)') - return int(val) - - elif type is list or type is dict: - return json.loads(val) - - raise Exception('Config values can only be str, bool, int or json') - - -def load_config_file(out_dir: str=None) -> Optional[Dict[str, str]]: - """load the ini-formatted config file from OUTPUT_DIR/Archivebox.conf""" - - out_dir = out_dir or Path(os.getenv('OUTPUT_DIR', '.')).resolve() - config_path = Path(out_dir) / CONFIG_FILENAME - if config_path.exists(): - config_file = ConfigParser() - config_file.optionxform = str - config_file.read(config_path) - # flatten into one namespace - config_file_vars = { - key.upper(): val - for section, options in config_file.items() - for key, val in options.items() - } - # print('[i] Loaded config file', os.path.abspath(config_path)) - # print(config_file_vars) - return config_file_vars - return None - - -def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict: - """load the ini-formatted config file from OUTPUT_DIR/Archivebox.conf""" - - from .system import atomic_write - - CONFIG_HEADER = ( - """# This is the config file for your ArchiveBox collection. 
- # - # You can add options here manually in INI format, or automatically by running: - # archivebox config --set KEY=VALUE - # - # If you modify this file manually, make sure to update your archive after by running: - # archivebox init - # - # A list of all possible config with documentation and examples can be found here: - # https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration - - """) - - out_dir = out_dir or Path(os.getenv('OUTPUT_DIR', '.')).resolve() - config_path = Path(out_dir) / CONFIG_FILENAME - - if not config_path.exists(): - atomic_write(config_path, CONFIG_HEADER) - - config_file = ConfigParser() - config_file.optionxform = str - config_file.read(config_path) - - with open(config_path, 'r', encoding='utf-8') as old: - atomic_write(f'{config_path}.bak', old.read()) - - find_section = lambda key: [name for name, opts in CONFIG_SCHEMA.items() if key in opts][0] - - # Set up sections in empty config file - for key, val in config.items(): - section = find_section(key) - if section in config_file: - existing_config = dict(config_file[section]) - else: - existing_config = {} - config_file[section] = {**existing_config, key: val} - - # always make sure there's a SECRET_KEY defined for Django - existing_secret_key = None - if 'SERVER_CONFIG' in config_file and 'SECRET_KEY' in config_file['SERVER_CONFIG']: - existing_secret_key = config_file['SERVER_CONFIG']['SECRET_KEY'] - - if (not existing_secret_key) or ('not a valid secret' in existing_secret_key): - from django.utils.crypto import get_random_string - chars = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_' - random_secret_key = get_random_string(50, chars) - if 'SERVER_CONFIG' in config_file: - config_file['SERVER_CONFIG']['SECRET_KEY'] = random_secret_key - else: - config_file['SERVER_CONFIG'] = {'SECRET_KEY': random_secret_key} - - with open(config_path, 'w+', encoding='utf-8') as new: - config_file.write(new) - - try: - # validate the config by attempting to re-parse it - 
CONFIG = load_all_config() - except BaseException: # lgtm [py/catch-base-exception] - # something went horribly wrong, rever to the previous version - with open(f'{config_path}.bak', 'r', encoding='utf-8') as old: - atomic_write(config_path, old.read()) - - raise - - if Path(f'{config_path}.bak').exists(): - os.remove(f'{config_path}.bak') - - return { - key.upper(): CONFIG.get(key.upper()) - for key in config.keys() - } - - - -def load_config(defaults: ConfigDefaultDict, - config: Optional[ConfigDict]=None, - out_dir: Optional[str]=None, - env_vars: Optional[os._Environ]=None, - config_file_vars: Optional[Dict[str, str]]=None) -> ConfigDict: - - env_vars = env_vars or os.environ - config_file_vars = config_file_vars or load_config_file(out_dir=out_dir) - - extended_config: ConfigDict = config.copy() if config else {} - for key, default in defaults.items(): - try: - extended_config[key] = load_config_val( - key, - default=default['default'], - type=default.get('type'), - aliases=default.get('aliases'), - config=extended_config, - env_vars=env_vars, - config_file_vars=config_file_vars, - ) - except KeyboardInterrupt: - raise SystemExit(0) - except Exception as e: - stderr() - stderr(f'[X] Error while loading configuration value: {key}', color='red', config=extended_config) - stderr(' {}: {}'.format(e.__class__.__name__, e)) - stderr() - stderr(' Check your config for mistakes and try again (your archive data is unaffected).') - stderr() - stderr(' For config documentation and examples see:') - stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration') - stderr() - # raise - raise SystemExit(2) - - return extended_config - -# def write_config(config: ConfigDict): - -# with open(os.path.join(config['OUTPUT_DIR'], CONFIG_FILENAME), 'w+') as f: - - -# Logging Helpers -def stdout(*args, color: Optional[str]=None, prefix: str='', config: Optional[ConfigDict]=None) -> None: - ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI - - if color: - 
strs = [ansi[color], ' '.join(str(a) for a in args), ansi['reset'], '\n'] - else: - strs = [' '.join(str(a) for a in args), '\n'] - - sys.stdout.write(prefix + ''.join(strs)) - -def stderr(*args, color: Optional[str]=None, prefix: str='', config: Optional[ConfigDict]=None) -> None: - ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI - - if color: - strs = [ansi[color], ' '.join(str(a) for a in args), ansi['reset'], '\n'] - else: - strs = [' '.join(str(a) for a in args), '\n'] - - sys.stderr.write(prefix + ''.join(strs)) - -def hint(text: Union[Tuple[str, ...], List[str], str], prefix=' ', config: Optional[ConfigDict]=None) -> None: - ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI - - if isinstance(text, str): - stderr('{}{lightred}Hint:{reset} {}'.format(prefix, text, **ansi)) - else: - stderr('{}{lightred}Hint:{reset} {}'.format(prefix, text[0], **ansi)) - for line in text[1:]: - stderr('{} {}'.format(prefix, line)) - - -# Dependency Metadata Helpers -def bin_version(binary: Optional[str]) -> Optional[str]: - """check the presence and return valid version line of a specified binary""" - - abspath = bin_path(binary) - if not binary or not abspath: - return None - - try: - version_str = run([abspath, "--version"], stdout=PIPE).stdout.strip().decode() - # take first 3 columns of first line of version info - return ' '.join(version_str.split('\n')[0].strip().split()[:3]) - except OSError: - pass - # stderr(f'[X] Unable to find working version of dependency: {binary}', color='red') - # stderr(' Make sure it\'s installed, then confirm it\'s working by running:') - # stderr(f' {binary} --version') - # stderr() - # stderr(' If you don\'t want to install it, you can disable it via config. 
See here for more info:') - # stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Install') - return None - -def bin_path(binary: Optional[str]) -> Optional[str]: - if binary is None: - return None - - node_modules_bin = Path('.') / 'node_modules' / '.bin' / binary - if node_modules_bin.exists(): - return str(node_modules_bin.resolve()) - - return shutil.which(str(Path(binary).expanduser())) or shutil.which(str(binary)) or binary - -def bin_hash(binary: Optional[str]) -> Optional[str]: - if binary is None: - return None - abs_path = bin_path(binary) - if abs_path is None or not Path(abs_path).exists(): - return None - - file_hash = md5() - with io.open(abs_path, mode='rb') as f: - for chunk in iter(lambda: f.read(io.DEFAULT_BUFFER_SIZE), b''): - file_hash.update(chunk) - - return f'md5:{file_hash.hexdigest()}' - -def find_chrome_binary() -> Optional[str]: - """find any installed chrome binaries in the default locations""" - # Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev - # make sure data dir finding precedence order always matches binary finding order - default_executable_paths = ( - 'chromium-browser', - 'chromium', - '/Applications/Chromium.app/Contents/MacOS/Chromium', - 'chrome', - 'google-chrome', - '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', - 'google-chrome-stable', - 'google-chrome-beta', - 'google-chrome-canary', - '/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary', - 'google-chrome-unstable', - 'google-chrome-dev', - ) - for name in default_executable_paths: - full_path_exists = shutil.which(name) - if full_path_exists: - return name - - return None - -def find_chrome_data_dir() -> Optional[str]: - """find any installed chrome user data directories in the default locations""" - # Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev - # make sure data dir finding precedence order always matches binary finding order - default_profile_paths = ( - '~/.config/chromium', - 
'~/Library/Application Support/Chromium', - '~/AppData/Local/Chromium/User Data', - '~/.config/chrome', - '~/.config/google-chrome', - '~/Library/Application Support/Google/Chrome', - '~/AppData/Local/Google/Chrome/User Data', - '~/.config/google-chrome-stable', - '~/.config/google-chrome-beta', - '~/Library/Application Support/Google/Chrome Canary', - '~/AppData/Local/Google/Chrome SxS/User Data', - '~/.config/google-chrome-unstable', - '~/.config/google-chrome-dev', - ) - for path in default_profile_paths: - full_path = Path(path).resolve() - if full_path.exists(): - return full_path - return None - -def wget_supports_compression(config): - try: - cmd = [ - config['WGET_BINARY'], - "--compression=auto", - "--help", - ] - return not run(cmd, stdout=DEVNULL, stderr=DEVNULL).returncode - except (FileNotFoundError, OSError): - return False - -def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict: - return { - 'PACKAGE_DIR': { - 'path': (config['PACKAGE_DIR']).resolve(), - 'enabled': True, - 'is_valid': (config['PACKAGE_DIR'] / '__main__.py').exists(), - }, - 'TEMPLATES_DIR': { - 'path': (config['TEMPLATES_DIR']).resolve(), - 'enabled': True, - 'is_valid': (config['TEMPLATES_DIR'] / 'static').exists(), - }, - 'CUSTOM_TEMPLATES_DIR': { - 'path': config['CUSTOM_TEMPLATES_DIR'] and Path(config['CUSTOM_TEMPLATES_DIR']).resolve(), - 'enabled': bool(config['CUSTOM_TEMPLATES_DIR']), - 'is_valid': config['CUSTOM_TEMPLATES_DIR'] and Path(config['CUSTOM_TEMPLATES_DIR']).exists(), - }, - # 'NODE_MODULES_DIR': { - # 'path': , - # 'enabled': , - # 'is_valid': (...).exists(), - # }, - } - -def get_external_locations(config: ConfigDict) -> ConfigValue: - abspath = lambda path: None if path is None else Path(path).resolve() - return { - 'CHROME_USER_DATA_DIR': { - 'path': abspath(config['CHROME_USER_DATA_DIR']), - 'enabled': config['USE_CHROME'] and config['CHROME_USER_DATA_DIR'], - 'is_valid': False if config['CHROME_USER_DATA_DIR'] is None else 
(Path(config['CHROME_USER_DATA_DIR']) / 'Default').exists(), - }, - 'COOKIES_FILE': { - 'path': abspath(config['COOKIES_FILE']), - 'enabled': config['USE_WGET'] and config['COOKIES_FILE'], - 'is_valid': False if config['COOKIES_FILE'] is None else Path(config['COOKIES_FILE']).exists(), - }, - } - -def get_data_locations(config: ConfigDict) -> ConfigValue: - return { - 'OUTPUT_DIR': { - 'path': config['OUTPUT_DIR'].resolve(), - 'enabled': True, - 'is_valid': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).exists(), - }, - 'SOURCES_DIR': { - 'path': config['SOURCES_DIR'].resolve(), - 'enabled': True, - 'is_valid': config['SOURCES_DIR'].exists(), - }, - 'LOGS_DIR': { - 'path': config['LOGS_DIR'].resolve(), - 'enabled': True, - 'is_valid': config['LOGS_DIR'].exists(), - }, - 'ARCHIVE_DIR': { - 'path': config['ARCHIVE_DIR'].resolve(), - 'enabled': True, - 'is_valid': config['ARCHIVE_DIR'].exists(), - }, - 'CONFIG_FILE': { - 'path': config['CONFIG_FILE'].resolve(), - 'enabled': True, - 'is_valid': config['CONFIG_FILE'].exists(), - }, - 'SQL_INDEX': { - 'path': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).resolve(), - 'enabled': True, - 'is_valid': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).exists(), - }, - } - -def get_dependency_info(config: ConfigDict) -> ConfigValue: - return { - 'ARCHIVEBOX_BINARY': { - 'path': bin_path(config['ARCHIVEBOX_BINARY']), - 'version': config['VERSION'], - 'hash': bin_hash(config['ARCHIVEBOX_BINARY']), - 'enabled': True, - 'is_valid': True, - }, - 'PYTHON_BINARY': { - 'path': bin_path(config['PYTHON_BINARY']), - 'version': config['PYTHON_VERSION'], - 'hash': bin_hash(config['PYTHON_BINARY']), - 'enabled': True, - 'is_valid': bool(config['PYTHON_VERSION']), - }, - 'DJANGO_BINARY': { - 'path': bin_path(config['DJANGO_BINARY']), - 'version': config['DJANGO_VERSION'], - 'hash': bin_hash(config['DJANGO_BINARY']), - 'enabled': True, - 'is_valid': bool(config['DJANGO_VERSION']), - }, - 'CURL_BINARY': { - 'path': bin_path(config['CURL_BINARY']), - 
'version': config['CURL_VERSION'], - 'hash': bin_hash(config['CURL_BINARY']), - 'enabled': config['USE_CURL'], - 'is_valid': bool(config['CURL_VERSION']), - }, - 'WGET_BINARY': { - 'path': bin_path(config['WGET_BINARY']), - 'version': config['WGET_VERSION'], - 'hash': bin_hash(config['WGET_BINARY']), - 'enabled': config['USE_WGET'], - 'is_valid': bool(config['WGET_VERSION']), - }, - 'NODE_BINARY': { - 'path': bin_path(config['NODE_BINARY']), - 'version': config['NODE_VERSION'], - 'hash': bin_hash(config['NODE_BINARY']), - 'enabled': config['USE_NODE'], - 'is_valid': bool(config['NODE_VERSION']), - }, - 'SINGLEFILE_BINARY': { - 'path': bin_path(config['SINGLEFILE_BINARY']), - 'version': config['SINGLEFILE_VERSION'], - 'hash': bin_hash(config['SINGLEFILE_BINARY']), - 'enabled': config['USE_SINGLEFILE'], - 'is_valid': bool(config['SINGLEFILE_VERSION']), - }, - 'READABILITY_BINARY': { - 'path': bin_path(config['READABILITY_BINARY']), - 'version': config['READABILITY_VERSION'], - 'hash': bin_hash(config['READABILITY_BINARY']), - 'enabled': config['USE_READABILITY'], - 'is_valid': bool(config['READABILITY_VERSION']), - }, - 'MERCURY_BINARY': { - 'path': bin_path(config['MERCURY_BINARY']), - 'version': config['MERCURY_VERSION'], - 'hash': bin_hash(config['MERCURY_BINARY']), - 'enabled': config['USE_MERCURY'], - 'is_valid': bool(config['MERCURY_VERSION']), - }, - 'GIT_BINARY': { - 'path': bin_path(config['GIT_BINARY']), - 'version': config['GIT_VERSION'], - 'hash': bin_hash(config['GIT_BINARY']), - 'enabled': config['USE_GIT'], - 'is_valid': bool(config['GIT_VERSION']), - }, - 'YOUTUBEDL_BINARY': { - 'path': bin_path(config['YOUTUBEDL_BINARY']), - 'version': config['YOUTUBEDL_VERSION'], - 'hash': bin_hash(config['YOUTUBEDL_BINARY']), - 'enabled': config['USE_YOUTUBEDL'], - 'is_valid': bool(config['YOUTUBEDL_VERSION']), - }, - 'CHROME_BINARY': { - 'path': bin_path(config['CHROME_BINARY']), - 'version': config['CHROME_VERSION'], - 'hash': bin_hash(config['CHROME_BINARY']), - 
'enabled': config['USE_CHROME'], - 'is_valid': bool(config['CHROME_VERSION']), - }, - 'RIPGREP_BINARY': { - 'path': bin_path(config['RIPGREP_BINARY']), - 'version': config['RIPGREP_VERSION'], - 'hash': bin_hash(config['RIPGREP_BINARY']), - 'enabled': config['USE_RIPGREP'], - 'is_valid': bool(config['RIPGREP_VERSION']), - }, - # TODO: add an entry for the sonic search backend? - # 'SONIC_BINARY': { - # 'path': bin_path(config['SONIC_BINARY']), - # 'version': config['SONIC_VERSION'], - # 'hash': bin_hash(config['SONIC_BINARY']), - # 'enabled': config['USE_SONIC'], - # 'is_valid': bool(config['SONIC_VERSION']), - # }, - } - -def get_chrome_info(config: ConfigDict) -> ConfigValue: - return { - 'TIMEOUT': config['TIMEOUT'], - 'RESOLUTION': config['RESOLUTION'], - 'CHECK_SSL_VALIDITY': config['CHECK_SSL_VALIDITY'], - 'CHROME_BINARY': config['CHROME_BINARY'], - 'CHROME_HEADLESS': config['CHROME_HEADLESS'], - 'CHROME_SANDBOX': config['CHROME_SANDBOX'], - 'CHROME_USER_AGENT': config['CHROME_USER_AGENT'], - 'CHROME_USER_DATA_DIR': config['CHROME_USER_DATA_DIR'], - } - - -# ****************************************************************************** -# ****************************************************************************** -# ******************************** Load Config ********************************* -# ******* (compile the defaults, configs, and metadata all into CONFIG) ******** -# ****************************************************************************** -# ****************************************************************************** - - -def load_all_config(): - CONFIG: ConfigDict = {} - for section_name, section_config in CONFIG_SCHEMA.items(): - CONFIG = load_config(section_config, CONFIG) - - return load_config(DYNAMIC_CONFIG_SCHEMA, CONFIG) - -# add all final config values in CONFIG to globals in this file -CONFIG = load_all_config() -globals().update(CONFIG) -# this lets us do: from .config import DEBUG, MEDIA_TIMEOUT, ... 
- - -# ****************************************************************************** -# ****************************************************************************** -# ****************************************************************************** -# ****************************************************************************** -# ****************************************************************************** - - - -########################### System Environment Setup ########################### - - -# Set timezone to UTC and umask to OUTPUT_PERMISSIONS -os.environ["TZ"] = 'UTC' -os.umask(0o777 - int(OUTPUT_PERMISSIONS, base=8)) # noqa: F821 - -# add ./node_modules/.bin to $PATH so we can use node scripts in extractors -NODE_BIN_PATH = str((Path(CONFIG["OUTPUT_DIR"]).absolute() / 'node_modules' / '.bin')) -sys.path.append(NODE_BIN_PATH) - -# disable stderr "you really shouldnt disable ssl" warnings with library config -if not CONFIG['CHECK_SSL_VALIDITY']: - import urllib3 - import requests - requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning) - urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) - - -########################### Config Validity Checkers ########################### - - -def check_system_config(config: ConfigDict=CONFIG) -> None: - ### Check system environment - if config['USER'] == 'root': - stderr('[!] 
ArchiveBox should never be run as root!', color='red') - stderr(' For more information, see the security overview documentation:') - stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#do-not-run-as-root') - raise SystemExit(2) - - ### Check Python environment - if sys.version_info[:3] < (3, 6, 0): - stderr(f'[X] Python version is not new enough: {config["PYTHON_VERSION"]} (>3.6 is required)', color='red') - stderr(' See https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.') - raise SystemExit(2) - - if config['PYTHON_ENCODING'] not in ('UTF-8', 'UTF8'): - stderr(f'[X] Your system is running python3 scripts with a bad locale setting: {config["PYTHON_ENCODING"]} (it should be UTF-8).', color='red') - stderr(' To fix it, add the line "export PYTHONIOENCODING=UTF-8" to your ~/.bashrc file (without quotes)') - stderr(' Or if you\'re using ubuntu/debian, run "dpkg-reconfigure locales"') - stderr('') - stderr(' Confirm that it\'s fixed by opening a new shell and running:') - stderr(' python3 -c "import sys; print(sys.stdout.encoding)" # should output UTF-8') - raise SystemExit(2) - - # stderr('[i] Using Chrome binary: {}'.format(shutil.which(CHROME_BINARY) or CHROME_BINARY)) - # stderr('[i] Using Chrome data dir: {}'.format(os.path.abspath(CHROME_USER_DATA_DIR))) - if config['CHROME_USER_DATA_DIR'] is not None: - if not (Path(config['CHROME_USER_DATA_DIR']) / 'Default').exists(): - stderr('[X] Could not find profile "Default" in CHROME_USER_DATA_DIR.', color='red') - stderr(f' {config["CHROME_USER_DATA_DIR"]}') - stderr(' Make sure you set it to a Chrome user data directory containing a Default profile folder.') - stderr(' For more info see:') - stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR') - if '/Default' in str(config['CHROME_USER_DATA_DIR']): - stderr() - stderr(' Try removing /Default from the end e.g.:') - stderr(' 
CHROME_USER_DATA_DIR="{}"'.format(config['CHROME_USER_DATA_DIR'].split('/Default')[0])) - raise SystemExit(2) - - -def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None: - invalid_dependencies = [ - (name, info) for name, info in config['DEPENDENCIES'].items() - if info['enabled'] and not info['is_valid'] - ] - if invalid_dependencies and show_help: - stderr(f'[!] Warning: Missing {len(invalid_dependencies)} recommended dependencies', color='lightyellow') - for dependency, info in invalid_dependencies: - stderr( - ' ! {}: {} ({})'.format( - dependency, - info['path'] or 'unable to find binary', - info['version'] or 'unable to detect version', - ) - ) - if dependency in ('YOUTUBEDL_BINARY', 'CHROME_BINARY', 'SINGLEFILE_BINARY', 'READABILITY_BINARY', 'MERCURY_BINARY'): - hint(('To install all packages automatically run: archivebox setup', - f'or to disable it and silence this warning: archivebox config --set SAVE_{dependency.rsplit("_", 1)[0]}=False', - ''), prefix=' ') - stderr('') - - if config['TIMEOUT'] < 5: - stderr(f'[!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={config["TIMEOUT"]} seconds)', color='red') - stderr(' You must allow *at least* 5 seconds for indexing and archive methods to run succesfully.') - stderr(' (Setting it to somewhere between 30 and 3000 seconds is recommended)') - stderr() - stderr(' If you want to make ArchiveBox run faster, disable specific archive methods instead:') - stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles') - stderr() - - elif config['USE_CHROME'] and config['TIMEOUT'] < 15: - stderr(f'[!] Warning: TIMEOUT is set too low! 
(currently set to TIMEOUT={config["TIMEOUT"]} seconds)', color='red') - stderr(' Chrome will fail to archive all sites if set to less than ~15 seconds.') - stderr(' (Setting it to somewhere between 30 and 300 seconds is recommended)') - stderr() - stderr(' If you want to make ArchiveBox run faster, disable specific archive methods instead:') - stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles') - stderr() - - if config['USE_YOUTUBEDL'] and config['MEDIA_TIMEOUT'] < 20: - stderr(f'[!] Warning: MEDIA_TIMEOUT is set too low! (currently set to MEDIA_TIMEOUT={config["MEDIA_TIMEOUT"]} seconds)', color='red') - stderr(' Youtube-dl will fail to archive all media if set to less than ~20 seconds.') - stderr(' (Setting it somewhere over 60 seconds is recommended)') - stderr() - stderr(' If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:') - stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media') - stderr() - -def check_data_folder(out_dir: Union[str, Path, None]=None, config: ConfigDict=CONFIG) -> None: - output_dir = out_dir or config['OUTPUT_DIR'] - assert isinstance(output_dir, (str, Path)) - - archive_dir_exists = (Path(output_dir) / ARCHIVE_DIR_NAME).exists() - if not archive_dir_exists: - stderr('[X] No archivebox index found in the current directory.', color='red') - stderr(f' {output_dir}', color='lightyellow') - stderr() - stderr(' {lightred}Hint{reset}: Are you running archivebox in the right folder?'.format(**config['ANSI'])) - stderr(' cd path/to/your/archive/folder') - stderr(' archivebox [command]') - stderr() - stderr(' {lightred}Hint{reset}: To create a new archive collection or import existing data in this folder, run:'.format(**config['ANSI'])) - stderr(' archivebox init') - raise SystemExit(2) - -def check_migrations(out_dir: Union[str, Path, None]=None, config: ConfigDict=CONFIG): - output_dir = out_dir or config['OUTPUT_DIR'] - from .index.sql import 
list_migrations - - pending_migrations = [name for status, name in list_migrations() if not status] - - if pending_migrations: - stderr('[X] This collection was created with an older version of ArchiveBox and must be upgraded first.', color='lightyellow') - stderr(f' {output_dir}') - stderr() - stderr(f' To upgrade it to the latest version and apply the {len(pending_migrations)} pending migrations, run:') - stderr(' archivebox init') - raise SystemExit(3) - - (Path(output_dir) / SOURCES_DIR_NAME).mkdir(exist_ok=True) - (Path(output_dir) / LOGS_DIR_NAME).mkdir(exist_ok=True) - - - -def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG, in_memory_db=False) -> None: - check_system_config() - - output_dir = out_dir or Path(config['OUTPUT_DIR']) - - assert isinstance(output_dir, Path) and isinstance(config['PACKAGE_DIR'], Path) - - try: - from django.core.management import call_command - - sys.path.append(str(config['PACKAGE_DIR'])) - os.environ.setdefault('OUTPUT_DIR', str(output_dir)) - assert (config['PACKAGE_DIR'] / 'core' / 'settings.py').exists(), 'settings.py was not found at archivebox/core/settings.py' - os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings') - - # Check to make sure JSON extension is available in our Sqlite3 instance - try: - cursor = sqlite3.connect(':memory:').cursor() - cursor.execute('SELECT JSON(\'{"a": "b"}\')') - except sqlite3.OperationalError as exc: - stderr(f'[X] Your SQLite3 version is missing the required JSON1 extension: {exc}', color='red') - hint([ - 'Upgrade your Python version or install the extension manually:', - 'https://code.djangoproject.com/wiki/JSON1Extension' - ]) - - if in_memory_db: - # some commands (e.g. oneshot) dont store a long-lived sqlite3 db file on disk. 
- # in those cases we create a temporary in-memory db and run the migrations - # immediately to get a usable in-memory-database at startup - os.environ.setdefault("ARCHIVEBOX_DATABASE_NAME", ":memory:") - django.setup() - call_command("migrate", interactive=False, verbosity=0) - else: - # Otherwise use default sqlite3 file-based database and initialize django - # without running migrations automatically (user runs them manually by calling init) - django.setup() - - - from django.conf import settings - - # log startup message to the error log - with open(settings.ERROR_LOG, "a+", encoding='utf-8') as f: - command = ' '.join(sys.argv) - ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S') - f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n") - - - if check_db: - # Enable WAL mode in sqlite3 - from django.db import connection - with connection.cursor() as cursor: - current_mode = cursor.execute("PRAGMA journal_mode") - if current_mode != 'wal': - cursor.execute("PRAGMA journal_mode=wal;") - - # Create cache table in DB if needed - try: - from django.core.cache import cache - cache.get('test', None) - except django.db.utils.OperationalError: - call_command("createcachetable", verbosity=0) - - - # if archivebox gets imported multiple times, we have to close - # the sqlite3 whenever we init from scratch to avoid multiple threads - # sharing the same connection by accident - from django.db import connections - for conn in connections.all(): - conn.close_if_unusable_or_obsolete() - - sql_index_path = Path(output_dir) / SQL_INDEX_FILENAME - assert sql_index_path.exists(), ( - f'No database file {SQL_INDEX_FILENAME} found in: {config["OUTPUT_DIR"]} (Are you in an ArchiveBox collection directory?)') - - except KeyboardInterrupt: - raise SystemExit(2) diff --git a/archivebox/config/__init__.py b/archivebox/config/__init__.py new file mode 100644 index 0000000000..5f4f9032a5 --- /dev/null +++ 
b/archivebox/config/__init__.py @@ -0,0 +1,105 @@ +""" +ArchiveBox config exports. + +This module provides backwards-compatible config exports for extractors +and other modules that expect to import config values directly. +""" + +__package__ = "archivebox.config" +__order__ = 200 + +from .paths import ( + PACKAGE_DIR, + DATA_DIR, + ARCHIVE_DIR, +) +from .constants import CONSTANTS, CONSTANTS_CONFIG, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR # noqa +from .version import VERSION # noqa + + +############################################################################### +# Config value exports for extractors +# These provide backwards compatibility with extractors that import from ..config +############################################################################### + + +def _get_config(): + """Lazy import to avoid circular imports.""" + from .common import ARCHIVING_CONFIG, STORAGE_CONFIG + + return ARCHIVING_CONFIG, STORAGE_CONFIG + + +# Direct exports (evaluated at import time for backwards compat) +# These are recalculated each time the module attribute is accessed + + +def __getattr__(name: str): + """ + Module-level __getattr__ for lazy config loading. + + Only provides backwards compatibility for GENERIC/SHARED config. + Plugin-specific config (binaries, args, toggles) should come from plugin config.json files. 
+ """ + + # Generic timeout settings (used by multiple plugins) + if name == "TIMEOUT": + cfg, _ = _get_config() + return cfg.TIMEOUT + + # Generic SSL/Security settings (used by multiple plugins) + if name == "CHECK_SSL_VALIDITY": + cfg, _ = _get_config() + return cfg.CHECK_SSL_VALIDITY + + # Generic storage settings (used by multiple plugins) + if name == "RESTRICT_FILE_NAMES": + _, storage = _get_config() + return storage.RESTRICT_FILE_NAMES + + # Generic user agent / cookies (used by multiple plugins) + if name == "COOKIES_FILE": + cfg, _ = _get_config() + return cfg.COOKIES_FILE + if name == "USER_AGENT": + cfg, _ = _get_config() + return cfg.USER_AGENT + + # Generic resolution settings (used by multiple plugins) + if name == "RESOLUTION": + cfg, _ = _get_config() + return cfg.RESOLUTION + + # Allowlist/Denylist patterns (compiled regexes) + if name == "SAVE_ALLOWLIST_PTN": + cfg, _ = _get_config() + return cfg.SAVE_ALLOWLIST_PTNS + if name == "SAVE_DENYLIST_PTN": + cfg, _ = _get_config() + return cfg.SAVE_DENYLIST_PTNS + + raise AttributeError(f"module 'archivebox.config' has no attribute '{name}'") + + +# Re-export common config classes for direct imports +def get_CONFIG(): + """Get all config sections as a dict.""" + from .common import ( + SHELL_CONFIG, + STORAGE_CONFIG, + GENERAL_CONFIG, + SERVER_CONFIG, + ARCHIVING_CONFIG, + SEARCH_BACKEND_CONFIG, + ) + from .ldap import LDAP_CONFIG + + return { + "SHELL_CONFIG": SHELL_CONFIG, + "STORAGE_CONFIG": STORAGE_CONFIG, + "GENERAL_CONFIG": GENERAL_CONFIG, + "SERVER_CONFIG": SERVER_CONFIG, + "ARCHIVING_CONFIG": ARCHIVING_CONFIG, + "SEARCHBACKEND_CONFIG": SEARCH_BACKEND_CONFIG, + "LDAP_CONFIG": LDAP_CONFIG, + } diff --git a/archivebox/config/collection.py b/archivebox/config/collection.py new file mode 100644 index 0000000000..215bafc4b9 --- /dev/null +++ b/archivebox/config/collection.py @@ -0,0 +1,305 @@ +__package__ = "archivebox.config" + +import os +import json +from typing import Any + +from pathlib import 
Path +from configparser import ConfigParser + +from benedict import benedict + + +from archivebox.config.constants import CONSTANTS + +from archivebox.misc.logging import stderr + + +class CaseConfigParser(ConfigParser): + def optionxform(self, optionstr: str) -> str: + return optionstr + + +def get_real_name(key: str) -> str: + """get the up-to-date canonical name for a given old alias or current key""" + # Config aliases are no longer used with the simplified config system + # Just return the key as-is since we no longer have a complex alias mapping + return key + + +def load_config_val( + key: str, + default: Any = None, + type: type | None = None, + aliases: tuple[str, ...] | None = None, + config: benedict | None = None, + env_vars: os._Environ | None = None, + config_file_vars: dict[str, str] | None = None, +) -> Any: + """parse bool, int, and str key=value pairs from env""" + + assert isinstance(config, dict) + + is_read_only = type is None + if is_read_only: + if callable(default): + return default(config) + return default + + # get value from environment variables or config files + config_keys_to_check = (key, *(aliases or ())) + val = None + for key in config_keys_to_check: + if env_vars: + val = env_vars.get(key) + if val: + break + + if config_file_vars: + val = config_file_vars.get(key) + if val: + break + + is_unset = val is None + if is_unset: + if callable(default): + return default(config) + return default + + assert isinstance(val, str) + + # calculate value based on expected type + BOOL_TRUEIES = ("true", "yes", "1") + BOOL_FALSEIES = ("false", "no", "0") + + if type is bool: + if val.lower() in BOOL_TRUEIES: + return True + elif val.lower() in BOOL_FALSEIES: + return False + else: + raise ValueError(f"Invalid configuration option {key}={val} (expected a boolean: True/False)") + + elif type is str: + if val.lower() in (*BOOL_TRUEIES, *BOOL_FALSEIES): + raise ValueError(f"Invalid configuration option {key}={val} (expected a string, but value looks 
like a boolean)") + return val.strip() + + elif type is int: + if not val.strip().isdigit(): + raise ValueError(f"Invalid configuration option {key}={val} (expected an integer)") + return int(val.strip()) + + elif type is list or type is dict: + return json.loads(val) + + elif type is Path: + return Path(val) + + raise Exception("Config values can only be str, bool, int, or json") + + +def load_config_file() -> benedict | None: + """load the ini-formatted config file from DATA_DIR/Archivebox.conf""" + + config_path = CONSTANTS.CONFIG_FILE + if os.access(config_path, os.R_OK): + config_file = CaseConfigParser() + config_file.read(config_path) + # flatten into one namespace + config_file_vars = benedict({key.upper(): val for section, options in config_file.items() for key, val in options.items()}) + # print('[i] Loaded config file', os.path.abspath(config_path)) + # print(config_file_vars) + return config_file_vars + return None + + +class PluginConfigSection: + """Pseudo-section for all plugin config keys written to [PLUGINS] section in ArchiveBox.conf""" + + toml_section_header = "PLUGINS" + + def __init__(self, key: str): + self._key = key + + def __getattr__(self, name: str) -> Any: + # Allow hasattr checks to pass for the key + if name == self._key: + return None + raise AttributeError(f"PluginConfigSection has no attribute '{name}'") + + def update_in_place(self, warn: bool = True, persist: bool = False, **kwargs): + """No-op update since plugins read config dynamically via get_config().""" + pass + + +def section_for_key(key: str) -> Any: + """Find the config section containing a given key.""" + from archivebox.config.common import ( + SHELL_CONFIG, + STORAGE_CONFIG, + GENERAL_CONFIG, + SERVER_CONFIG, + ARCHIVING_CONFIG, + SEARCH_BACKEND_CONFIG, + ) + + # First check core config sections + for section in [ + SHELL_CONFIG, + STORAGE_CONFIG, + GENERAL_CONFIG, + SERVER_CONFIG, + ARCHIVING_CONFIG, + SEARCH_BACKEND_CONFIG, + ]: + if hasattr(section, key): + return 
section + + # Check if this is a plugin config key + from archivebox.hooks import discover_plugin_configs + + plugin_configs = discover_plugin_configs() + for plugin_name, schema in plugin_configs.items(): + if "properties" in schema and key in schema["properties"]: + # All plugin config goes to [PLUGINS] section + return PluginConfigSection(key) + + raise ValueError(f"No config section found for key: {key}") + + +def write_config_file(config: dict[str, str]) -> benedict: + """load the ini-formatted config file from DATA_DIR/Archivebox.conf""" + + from archivebox.misc.system import atomic_write + + CONFIG_HEADER = """# This is the config file for your ArchiveBox collection. + # + # You can add options here manually in INI format, or automatically by running: + # archivebox config --set KEY=VALUE + # + # If you modify this file manually, make sure to update your archive after by running: + # archivebox init + # + # A list of all possible config with documentation and examples can be found here: + # https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration + + """ + + config_path = CONSTANTS.CONFIG_FILE + + if not os.access(config_path, os.F_OK): + atomic_write(config_path, CONFIG_HEADER) + + config_file = CaseConfigParser() + config_file.read(config_path) + + with open(config_path, encoding="utf-8") as old: + atomic_write(f"{config_path}.bak", old.read()) + + # Set up sections in empty config file + for key, val in config.items(): + section = section_for_key(key) + assert section is not None + + if not hasattr(section, "toml_section_header"): + raise ValueError(f"{key} is read-only (defined in {type(section).__module__}.{type(section).__name__}). 
Refusing to set.") + + section_name = section.toml_section_header + + if section_name in config_file: + existing_config = dict(config_file[section_name]) + else: + existing_config = {} + + config_file[section_name] = benedict({**existing_config, key: val}) + section.update_in_place(warn=False, persist=False, **{key: val}) + + with open(config_path, "w+", encoding="utf-8") as new: + config_file.write(new) + + updated_config = {} + try: + # validate the updated_config by attempting to re-parse it + from archivebox.config.configset import get_flat_config + + updated_config = {**load_all_config(), **get_flat_config()} + except BaseException: # lgtm [py/catch-base-exception] + # something went horribly wrong, revert to the previous version + with open(f"{config_path}.bak", encoding="utf-8") as old: + atomic_write(config_path, old.read()) + + raise + + if os.access(f"{config_path}.bak", os.F_OK): + os.remove(f"{config_path}.bak") + + return benedict({key.upper(): updated_config.get(key.upper()) for key in config.keys()}) + + +def load_config( + defaults: dict[str, Any], + config: benedict | None = None, + out_dir: str | None = None, + env_vars: os._Environ | None = None, + config_file_vars: dict[str, str] | None = None, +) -> benedict: + + env_vars = env_vars or os.environ + config_file_vars = config_file_vars or load_config_file() + + extended_config = benedict(config.copy() if config else {}) + for key, default in defaults.items(): + try: + # print('LOADING CONFIG KEY:', key, 'DEFAULT=', default) + extended_config[key] = load_config_val( + key, + default=default["default"], + type=default.get("type"), + aliases=default.get("aliases"), + config=extended_config, + env_vars=env_vars, + config_file_vars=config_file_vars, + ) + except KeyboardInterrupt: + raise SystemExit(0) + except Exception as e: + stderr() + stderr(f"[X] Error while loading configuration value: {key}", color="red", config=extended_config) + stderr(f" {e.__class__.__name__}: {e}") + stderr() + stderr(" 
Check your config for mistakes and try again (your archive data is unaffected).") + stderr() + stderr(" For config documentation and examples see:") + stderr(" https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration") + stderr() + # raise + # raise SystemExit(2) + + return benedict(extended_config) + + +def load_all_config(): + """Load all config sections and return as a flat dict.""" + from archivebox.config.common import ( + SHELL_CONFIG, + STORAGE_CONFIG, + GENERAL_CONFIG, + SERVER_CONFIG, + ARCHIVING_CONFIG, + SEARCH_BACKEND_CONFIG, + ) + + flat_config = benedict() + + for config_section in [ + SHELL_CONFIG, + STORAGE_CONFIG, + GENERAL_CONFIG, + SERVER_CONFIG, + ARCHIVING_CONFIG, + SEARCH_BACKEND_CONFIG, + ]: + flat_config.update(dict(config_section)) + + return flat_config diff --git a/archivebox/config/common.py b/archivebox/config/common.py new file mode 100644 index 0000000000..2be64d9bf0 --- /dev/null +++ b/archivebox/config/common.py @@ -0,0 +1,315 @@ +__package__ = "archivebox.config" + +import re +import secrets +import sys +import shutil +from typing import ClassVar +from pathlib import Path + +from rich.console import Console +from pydantic import Field, field_validator + +from archivebox.config.configset import BaseConfigSet + +from .constants import CONSTANTS +from .version import get_COMMIT_HASH, get_BUILD_TIME, VERSION +from .permissions import IN_DOCKER + +###################### Config ########################## + +_STDOUT_CONSOLE = Console() +_STDERR_CONSOLE = Console(stderr=True) + + +def rprint(*args, file=None, **kwargs): + console = _STDERR_CONSOLE if file is sys.stderr else _STDOUT_CONSOLE + console.print(*args, **kwargs) + + +class ShellConfig(BaseConfigSet): + toml_section_header: str = "SHELL_CONFIG" + + DEBUG: bool = Field(default="--debug" in sys.argv) + + IS_TTY: bool = Field(default=sys.stdout.isatty()) + USE_COLOR: bool = Field(default=sys.stdout.isatty()) + SHOW_PROGRESS: bool = Field(default=sys.stdout.isatty()) + + IN_DOCKER: 
bool = Field(default=IN_DOCKER) + IN_QEMU: bool = Field(default=False) + + ANSI: dict[str, str] = Field( + default_factory=lambda: CONSTANTS.DEFAULT_CLI_COLORS if sys.stdout.isatty() else CONSTANTS.DISABLED_CLI_COLORS, + ) + + @property + def TERM_WIDTH(self) -> int: + if not self.IS_TTY: + return 200 + return shutil.get_terminal_size((140, 10)).columns + + @property + def COMMIT_HASH(self) -> str | None: + return get_COMMIT_HASH() + + @property + def BUILD_TIME(self) -> str: + return get_BUILD_TIME() + + +SHELL_CONFIG = ShellConfig() + + +class StorageConfig(BaseConfigSet): + toml_section_header: str = "STORAGE_CONFIG" + + # TMP_DIR must be a local, fast, readable/writable dir by archivebox user, + # must be a short path due to unix path length restrictions for socket files (<100 chars) + # must be a local SSD/tmpfs for speed and because bind mounts/network mounts/FUSE dont support unix sockets + TMP_DIR: Path = Field(default=CONSTANTS.DEFAULT_TMP_DIR) + + # LIB_DIR must be a local, fast, readable/writable dir by archivebox user, + # must be able to contain executable binaries (up to 5GB size) + # should not be a remote/network/FUSE mount for speed reasons, otherwise extractors will be slow + LIB_DIR: Path = Field(default=CONSTANTS.DEFAULT_LIB_DIR) + + # LIB_BIN_DIR is where all installed binaries are symlinked for easy PATH management + # Derived from LIB_DIR / 'bin', should be prepended to PATH for all hook executions + LIB_BIN_DIR: Path = Field(default=CONSTANTS.DEFAULT_LIB_BIN_DIR) + + # CUSTOM_TEMPLATES_DIR allows users to override default templates + # defaults to DATA_DIR / 'user_templates' but can be configured + CUSTOM_TEMPLATES_DIR: Path = Field(default=CONSTANTS.CUSTOM_TEMPLATES_DIR) + + OUTPUT_PERMISSIONS: str = Field(default="644") + RESTRICT_FILE_NAMES: str = Field(default="windows") + ENFORCE_ATOMIC_WRITES: bool = Field(default=True) + + # not supposed to be user settable: + DIR_OUTPUT_PERMISSIONS: str = Field(default="755") # computed from 
OUTPUT_PERMISSIONS + + +STORAGE_CONFIG = StorageConfig() + + +class GeneralConfig(BaseConfigSet): + toml_section_header: str = "GENERAL_CONFIG" + + TAG_SEPARATOR_PATTERN: str = Field(default=r"[,]") + + +GENERAL_CONFIG = GeneralConfig() + + +class ServerConfig(BaseConfigSet): + toml_section_header: str = "SERVER_CONFIG" + + SERVER_SECURITY_MODES: ClassVar[tuple[str, ...]] = ( + "safe-subdomains-fullreplay", + "safe-onedomain-nojsreplay", + "unsafe-onedomain-noadmin", + "danger-onedomain-fullreplay", + ) + + SECRET_KEY: str = Field(default_factory=lambda: "".join(secrets.choice("abcdefghijklmnopqrstuvwxyz0123456789_") for _ in range(50))) + BIND_ADDR: str = Field(default="127.0.0.1:8000") + LISTEN_HOST: str = Field(default="archivebox.localhost:8000") + ADMIN_BASE_URL: str = Field(default="") + ARCHIVE_BASE_URL: str = Field(default="") + ALLOWED_HOSTS: str = Field(default="*") + CSRF_TRUSTED_ORIGINS: str = Field(default="http://admin.archivebox.localhost:8000") + SERVER_SECURITY_MODE: str = Field(default="safe-subdomains-fullreplay") + + SNAPSHOTS_PER_PAGE: int = Field(default=40) + PREVIEW_ORIGINALS: bool = Field(default=True) + FOOTER_INFO: str = Field( + default="Content is hosted for personal archiving purposes only. 
Contact server owner for any takedown requests.", + ) + # CUSTOM_TEMPLATES_DIR: Path = Field(default=None) # this is now a constant + + PUBLIC_INDEX: bool = Field(default=True) + PUBLIC_SNAPSHOTS: bool = Field(default=True) + PUBLIC_ADD_VIEW: bool = Field(default=False) + + ADMIN_USERNAME: str | None = Field(default=None) + ADMIN_PASSWORD: str | None = Field(default=None) + + REVERSE_PROXY_USER_HEADER: str = Field(default="Remote-User") + REVERSE_PROXY_WHITELIST: str = Field(default="") + LOGOUT_REDIRECT_URL: str = Field(default="/") + + @field_validator("SERVER_SECURITY_MODE", mode="after") + def validate_server_security_mode(cls, v: str) -> str: + mode = (v or "").strip().lower() + if mode not in cls.SERVER_SECURITY_MODES: + raise ValueError(f"SERVER_SECURITY_MODE must be one of: {', '.join(cls.SERVER_SECURITY_MODES)}") + return mode + + @property + def USES_SUBDOMAIN_ROUTING(self) -> bool: + return self.SERVER_SECURITY_MODE == "safe-subdomains-fullreplay" + + @property + def ENABLES_FULL_JS_REPLAY(self) -> bool: + return self.SERVER_SECURITY_MODE in ( + "safe-subdomains-fullreplay", + "unsafe-onedomain-noadmin", + "danger-onedomain-fullreplay", + ) + + @property + def CONTROL_PLANE_ENABLED(self) -> bool: + return self.SERVER_SECURITY_MODE != "unsafe-onedomain-noadmin" + + @property + def BLOCK_UNSAFE_METHODS(self) -> bool: + return self.SERVER_SECURITY_MODE == "unsafe-onedomain-noadmin" + + @property + def SHOULD_NEUTER_RISKY_REPLAY(self) -> bool: + return self.SERVER_SECURITY_MODE == "safe-onedomain-nojsreplay" + + @property + def IS_UNSAFE_MODE(self) -> bool: + return self.SERVER_SECURITY_MODE == "unsafe-onedomain-noadmin" + + @property + def IS_DANGEROUS_MODE(self) -> bool: + return self.SERVER_SECURITY_MODE == "danger-onedomain-fullreplay" + + @property + def IS_LOWER_SECURITY_MODE(self) -> bool: + return self.SERVER_SECURITY_MODE in ( + "unsafe-onedomain-noadmin", + "danger-onedomain-fullreplay", + ) + + +SERVER_CONFIG = ServerConfig() + + +def 
_print_server_security_mode_warning() -> None: + if not SERVER_CONFIG.IS_LOWER_SECURITY_MODE: + return + + rprint( + f"[yellow][!] WARNING: ArchiveBox is running with SERVER_SECURITY_MODE={SERVER_CONFIG.SERVER_SECURITY_MODE}[/yellow]", + file=sys.stderr, + ) + rprint( + "[yellow] Archived pages may share an origin with privileged app routes in this mode.[/yellow]", + file=sys.stderr, + ) + rprint( + "[yellow] To switch to the safer isolated setup:[/yellow]", + file=sys.stderr, + ) + rprint( + "[yellow] 1. Set SERVER_SECURITY_MODE=safe-subdomains-fullreplay[/yellow]", + file=sys.stderr, + ) + rprint( + "[yellow] 2. Point *.archivebox.localhost (or your chosen base domain) at this server[/yellow]", + file=sys.stderr, + ) + rprint( + "[yellow] 3. Configure wildcard DNS/TLS or your reverse proxy so admin., web., api., and snapshot subdomains resolve[/yellow]", + file=sys.stderr, + ) + + +_print_server_security_mode_warning() + + +class ArchivingConfig(BaseConfigSet): + toml_section_header: str = "ARCHIVING_CONFIG" + + ONLY_NEW: bool = Field(default=True) + OVERWRITE: bool = Field(default=False) + + TIMEOUT: int = Field(default=60) + MAX_URL_ATTEMPTS: int = Field(default=50) + + RESOLUTION: str = Field(default="1440,2000") + CHECK_SSL_VALIDITY: bool = Field(default=True) + USER_AGENT: str = Field( + default=f"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)", + ) + COOKIES_FILE: Path | None = Field(default=None) + + URL_DENYLIST: str = Field(default=r"\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$", alias="URL_BLACKLIST") + URL_ALLOWLIST: str | None = Field(default=None, alias="URL_WHITELIST") + + SAVE_ALLOWLIST: dict[str, list[str]] = Field(default={}) # mapping of regex patterns to list of archive methods + SAVE_DENYLIST: dict[str, list[str]] = Field(default={}) + + DEFAULT_PERSONA: str = 
Field(default="Default") + + def warn_if_invalid(self) -> None: + if int(self.TIMEOUT) < 5: + rprint(f"[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.TIMEOUT} seconds)[/red]", file=sys.stderr) + rprint(" You must allow *at least* 5 seconds for indexing and archive methods to run successfully.", file=sys.stderr) + rprint(" (Setting it to somewhere between 30 and 3000 seconds is recommended)", file=sys.stderr) + rprint(file=sys.stderr) + rprint(" If you want to make ArchiveBox run faster, disable specific archive methods instead:", file=sys.stderr) + rprint(" https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles", file=sys.stderr) + rprint(file=sys.stderr) + + @field_validator("CHECK_SSL_VALIDITY", mode="after") + def validate_check_ssl_validity(cls, v): + """SIDE EFFECT: disable "you really shouldnt disable ssl" warnings emitted by requests""" + if not v: + import urllib3 + + urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + return v + + @property + def URL_ALLOWLIST_PTN(self) -> re.Pattern | None: + return re.compile(self.URL_ALLOWLIST, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS) if self.URL_ALLOWLIST else None + + @property + def URL_DENYLIST_PTN(self) -> re.Pattern: + return re.compile(self.URL_DENYLIST, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS) + + @property + def SAVE_ALLOWLIST_PTNS(self) -> dict[re.Pattern, list[str]]: + return ( + { + # regexp: methods list + re.compile(key, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS): val + for key, val in self.SAVE_ALLOWLIST.items() + } + if self.SAVE_ALLOWLIST + else {} + ) + + @property + def SAVE_DENYLIST_PTNS(self) -> dict[re.Pattern, list[str]]: + return ( + { + # regexp: methods list + re.compile(key, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS): val + for key, val in self.SAVE_DENYLIST.items() + } + if self.SAVE_DENYLIST + else {} + ) + + +ARCHIVING_CONFIG = ArchivingConfig() +ARCHIVING_CONFIG.warn_if_invalid() + + +class SearchBackendConfig(BaseConfigSet): + 
toml_section_header: str = "SEARCH_BACKEND_CONFIG" + + USE_INDEXING_BACKEND: bool = Field(default=True) + USE_SEARCHING_BACKEND: bool = Field(default=True) + + SEARCH_BACKEND_ENGINE: str = Field(default="ripgrep") + SEARCH_PROCESS_HTML: bool = Field(default=True) + + +SEARCH_BACKEND_CONFIG = SearchBackendConfig() diff --git a/archivebox/config/configset.py b/archivebox/config/configset.py new file mode 100644 index 0000000000..da055632de --- /dev/null +++ b/archivebox/config/configset.py @@ -0,0 +1,408 @@ +""" +Simplified config system for ArchiveBox. + +This replaces the complex abx_spec_config/base_configset.py with a simpler +approach that still supports environment variables, config files, and +per-object overrides. +""" + +__package__ = "archivebox.config" + +import os +import json +from pathlib import Path +from typing import Any +from configparser import ConfigParser + +from pydantic_settings import BaseSettings, PydanticBaseSettingsSource, SettingsConfigDict + + +class CaseConfigParser(ConfigParser): + def optionxform(self, optionstr: str) -> str: + return optionstr + + +class IniConfigSettingsSource(PydanticBaseSettingsSource): + """ + Custom settings source that reads from ArchiveBox.conf (INI format). + Flattens all sections into a single namespace. 
+ """ + + def get_field_value(self, field: Any, field_name: str) -> tuple[Any, str, bool]: + config_vals = self._load_config_file() + field_value = config_vals.get(field_name.upper()) + return field_value, field_name, False + + def __call__(self) -> dict[str, Any]: + return self._load_config_file() + + def _load_config_file(self) -> dict[str, Any]: + try: + from archivebox.config.constants import CONSTANTS + + config_path = CONSTANTS.CONFIG_FILE + except ImportError: + return {} + + if not config_path.exists(): + return {} + + parser = CaseConfigParser() + parser.read(config_path) + + # Flatten all sections into single namespace (ignore section headers) + return {key.upper(): value for section in parser.sections() for key, value in parser.items(section)} + + +class BaseConfigSet(BaseSettings): + """ + Base class for config sections. + + Automatically loads values from (highest to lowest priority): + 1. Environment variables + 2. ArchiveBox.conf file (INI format, flattened) + 3. Default values + + Subclasses define fields with defaults and types: + + class ShellConfig(BaseConfigSet): + DEBUG: bool = Field(default=False) + USE_COLOR: bool = Field(default=True) + """ + + model_config = SettingsConfigDict( + env_prefix="", + extra="ignore", + validate_default=True, + ) + + @classmethod + def settings_customise_sources( + cls, + settings_cls: type[BaseSettings], + init_settings: PydanticBaseSettingsSource, + env_settings: PydanticBaseSettingsSource, + dotenv_settings: PydanticBaseSettingsSource, + file_secret_settings: PydanticBaseSettingsSource, + ) -> tuple[PydanticBaseSettingsSource, ...]: + """ + Define the order of settings sources (first = highest priority). + """ + return ( + init_settings, # 1. Passed to __init__ + env_settings, # 2. Environment variables + IniConfigSettingsSource(settings_cls), # 3. 
ArchiveBox.conf file + # dotenv_settings, # Skip .env files + # file_secret_settings, # Skip secrets files + ) + + @classmethod + def load_from_file(cls, config_path: Path) -> dict[str, str]: + """Load config values from INI file.""" + if not config_path.exists(): + return {} + + parser = CaseConfigParser() + parser.read(config_path) + + # Flatten all sections into single namespace + return {key.upper(): value for section in parser.sections() for key, value in parser.items(section)} + + def update_in_place(self, warn: bool = True, persist: bool = False, **kwargs) -> None: + """ + Update config values in place. + + This allows runtime updates to config without reloading. + """ + for key, value in kwargs.items(): + if hasattr(self, key): + # Use object.__setattr__ to bypass pydantic's frozen model + object.__setattr__(self, key, value) + + +def get_config( + defaults: dict | None = None, + persona: Any = None, + user: Any = None, + crawl: Any = None, + snapshot: Any = None, + archiveresult: Any = None, + machine: Any = None, +) -> dict[str, Any]: + """ + Get merged config from all sources. + + Priority (highest to lowest): + 1. Per-snapshot config (snapshot.config JSON field) + 2. Per-crawl config (crawl.config JSON field) + 3. Per-user config (user.config JSON field) + 4. Per-persona config (persona.get_derived_config() - includes CHROME_USER_DATA_DIR etc.) + 5. Environment variables + 6. Config file (ArchiveBox.conf) + 7. Plugin schema defaults (config.json) + 8. 
Core config defaults + + Args: + defaults: Default values to start with + persona: Persona object (provides derived paths like CHROME_USER_DATA_DIR) + user: User object with config JSON field + crawl: Crawl object with config JSON field + snapshot: Snapshot object with config JSON field + archiveresult: ArchiveResult object (auto-fetches snapshot) + machine: Unused legacy argument kept for call compatibility + + Note: Objects are auto-fetched from relationships if not provided: + - snapshot auto-fetched from archiveresult.snapshot + - crawl auto-fetched from snapshot.crawl + - user auto-fetched from crawl.created_by + + Returns: + Merged config dict + """ + # Auto-fetch related objects from relationships + if snapshot is None and archiveresult and hasattr(archiveresult, "snapshot"): + snapshot = archiveresult.snapshot + + if crawl is None and snapshot and hasattr(snapshot, "crawl"): + crawl = snapshot.crawl + + if user is None and crawl and hasattr(crawl, "created_by"): + user = crawl.created_by + + if persona is None and crawl is not None: + from archivebox.personas.models import Persona + + persona_id = getattr(crawl, "persona_id", None) + if persona_id: + persona = Persona.objects.filter(id=persona_id).first() + if persona is None: + raise Persona.DoesNotExist(f"Crawl {getattr(crawl, 'id', None)} references missing Persona {persona_id}") + + if persona is None: + crawl_config = getattr(crawl, "config", None) or {} + default_persona_name = str(crawl_config.get("DEFAULT_PERSONA") or "").strip() + if default_persona_name: + persona, _ = Persona.objects.get_or_create(name=default_persona_name or "Default") + persona.ensure_dirs() + from archivebox.config.constants import CONSTANTS + from archivebox.config.common import ( + SHELL_CONFIG, + STORAGE_CONFIG, + GENERAL_CONFIG, + SERVER_CONFIG, + ARCHIVING_CONFIG, + SEARCH_BACKEND_CONFIG, + ) + + # Start with defaults + config = dict(defaults or {}) + + # Add plugin config defaults from JSONSchema config.json files + try: 
+ from archivebox.hooks import get_config_defaults_from_plugins + + plugin_defaults = get_config_defaults_from_plugins() + config.update(plugin_defaults) + except ImportError: + pass # hooks not available yet during early startup + + # Add all core config sections + config.update(dict(SHELL_CONFIG)) + config.update(dict(STORAGE_CONFIG)) + config.update(dict(GENERAL_CONFIG)) + config.update(dict(SERVER_CONFIG)) + config.update(dict(ARCHIVING_CONFIG)) + config.update(dict(SEARCH_BACKEND_CONFIG)) + + # Load from archivebox.config.file + config_file = CONSTANTS.CONFIG_FILE + if config_file.exists(): + file_config = BaseConfigSet.load_from_file(config_file) + config.update(file_config) + + # Override with environment variables (for keys that exist in config) + for key in config: + env_val = os.environ.get(key) + if env_val is not None: + config[key] = _parse_env_value(env_val, config.get(key)) + + # Also add NEW environment variables (not yet in config) + # This is important for worker subprocesses that receive config via Process.env + for key, value in os.environ.items(): + if key.isupper() and key not in config: # Only uppercase keys (config convention) + config[key] = _parse_env_value(value, None) + + # Also check plugin config aliases in environment + try: + from archivebox.hooks import discover_plugin_configs + + plugin_configs = discover_plugin_configs() + for plugin_name, schema in plugin_configs.items(): + for key, prop_schema in schema.get("properties", {}).items(): + # Check x-aliases + for alias in prop_schema.get("x-aliases", []): + if alias in os.environ and key not in os.environ: + config[key] = _parse_env_value(os.environ[alias], config.get(key)) + break + # Check x-fallback + fallback = prop_schema.get("x-fallback") + if fallback and fallback in config and key not in config: + config[key] = config[fallback] + except ImportError: + pass + + # Apply persona config overrides (includes derived paths like CHROME_USER_DATA_DIR) + if persona and 
hasattr(persona, "get_derived_config"): + config.update(persona.get_derived_config()) + + # Apply user config overrides + if user and hasattr(user, "config") and user.config: + config.update(user.config) + + # Apply crawl config overrides + if crawl and hasattr(crawl, "config") and crawl.config: + config.update(crawl.config) + + # Add crawl path aliases for hooks that need shared crawl state. + if crawl and hasattr(crawl, "output_dir"): + config["CRAWL_OUTPUT_DIR"] = str(crawl.output_dir) + config["CRAWL_DIR"] = str(crawl.output_dir) + + # Apply snapshot config overrides (highest priority) + if snapshot and hasattr(snapshot, "config") and snapshot.config: + config.update(snapshot.config) + + if snapshot and hasattr(snapshot, "output_dir"): + config["SNAP_DIR"] = str(snapshot.output_dir) + + # Normalize all aliases to canonical names (after all sources merged) + # This handles aliases that came from user/crawl/snapshot configs, not just env + try: + from archivebox.hooks import discover_plugin_configs + + plugin_configs = discover_plugin_configs() + aliases_to_normalize = {} # {alias_key: canonical_key} + + # Build alias mapping from all plugin schemas + for plugin_name, schema in plugin_configs.items(): + for canonical_key, prop_schema in schema.get("properties", {}).items(): + for alias in prop_schema.get("x-aliases", []): + aliases_to_normalize[alias] = canonical_key + + # Normalize: copy alias values to canonical keys (aliases take precedence) + for alias_key, canonical_key in aliases_to_normalize.items(): + if alias_key in config: + # Alias exists - copy to canonical key (overwriting any default) + config[canonical_key] = config[alias_key] + # Remove alias from config to keep it clean + del config[alias_key] + except ImportError: + pass + + if not config.get("DATA_DIR"): + config["DATA_DIR"] = str(CONSTANTS.DATA_DIR) + config["ABX_RUNTIME"] = "archivebox" + + return config + + +def get_flat_config() -> dict[str, Any]: + """ + Get a flat dictionary of all config 
values. + + Replaces abx.pm.hook.get_FLAT_CONFIG() + """ + return get_config() + + +def get_all_configs() -> dict[str, BaseConfigSet]: + """ + Get all config section objects as a dictionary. + + Replaces abx.pm.hook.get_CONFIGS() + """ + from archivebox.config.common import ( + SHELL_CONFIG, + SERVER_CONFIG, + ARCHIVING_CONFIG, + SEARCH_BACKEND_CONFIG, + ) + + return { + "SHELL_CONFIG": SHELL_CONFIG, + "SERVER_CONFIG": SERVER_CONFIG, + "ARCHIVING_CONFIG": ARCHIVING_CONFIG, + "SEARCH_BACKEND_CONFIG": SEARCH_BACKEND_CONFIG, + } + + +def _parse_env_value(value: str, default: Any = None) -> Any: + """Parse an environment variable value based on expected type.""" + if default is None: + # Try to guess the type + if value.lower() in ("true", "false", "yes", "no", "1", "0"): + return value.lower() in ("true", "yes", "1") + try: + return int(value) + except ValueError: + pass + try: + return json.loads(value) + except (json.JSONDecodeError, ValueError): + pass + return value + + # Parse based on default's type + if isinstance(default, bool): + return value.lower() in ("true", "yes", "1") + elif isinstance(default, int): + return int(value) + elif isinstance(default, float): + return float(value) + elif isinstance(default, (list, dict)): + return json.loads(value) + elif isinstance(default, Path): + return Path(value) + else: + return value + + +# Default worker concurrency settings +DEFAULT_WORKER_CONCURRENCY = { + "crawl": 2, + "snapshot": 3, + "wget": 2, + "ytdlp": 2, + "screenshot": 3, + "singlefile": 2, + "title": 5, + "favicon": 5, + "headers": 5, + "archivedotorg": 2, + "readability": 3, + "mercury": 3, + "git": 2, + "pdf": 2, + "dom": 3, +} + + +def get_worker_concurrency() -> dict[str, int]: + """ + Get worker concurrency settings. + + Can be configured via WORKER_CONCURRENCY env var as JSON dict. 
+ """ + config = get_config() + + # Start with defaults + concurrency = DEFAULT_WORKER_CONCURRENCY.copy() + + # Override with config + if "WORKER_CONCURRENCY" in config: + custom = config["WORKER_CONCURRENCY"] + if isinstance(custom, str): + custom = json.loads(custom) + concurrency.update(custom) + + return concurrency diff --git a/archivebox/config/constants.py b/archivebox/config/constants.py new file mode 100644 index 0000000000..40fc11d9aa --- /dev/null +++ b/archivebox/config/constants.py @@ -0,0 +1,284 @@ +""" +Constants are for things that never change at runtime. +(but they can change from run-to-run or machine-to-machine) + +DATA_DIR will never change at runtime, but you can run +archivebox from inside a different DATA_DIR on the same machine. + +This is loaded very early in the archivebox startup flow, so nothing in this file +or imported from this file should import anything from archivebox.config.common, +django, other INSTALLED_APPS, or anything else that is not in a standard library. 
+""" + +__package__ = "archivebox.config" + +import re +import sys + +from pathlib import Path + +from benedict import benedict + +from archivebox.misc.logging import DEFAULT_CLI_COLORS + +from .paths import ( + PACKAGE_DIR, + DATA_DIR, + ARCHIVE_DIR, + get_collection_id, + get_machine_id, + get_machine_type, +) +from .permissions import ( + IS_ROOT, + IN_DOCKER, + RUNNING_AS_UID, + RUNNING_AS_GID, + DEFAULT_PUID, + DEFAULT_PGID, + ARCHIVEBOX_USER, + ARCHIVEBOX_GROUP, +) +from .version import detect_installed_version + +###################### Config ########################## + + +class ConstantsDict: + PACKAGE_DIR: Path = PACKAGE_DIR + DATA_DIR: Path = DATA_DIR + ARCHIVE_DIR: Path = ARCHIVE_DIR + + MACHINE_TYPE: str = get_machine_type() + MACHINE_ID: str = get_machine_id() + COLLECTION_ID: str = get_collection_id(DATA_DIR) + + # Host system + VERSION: str = detect_installed_version(PACKAGE_DIR) + IN_DOCKER: bool = IN_DOCKER + + # Permissions + IS_ROOT: bool = IS_ROOT + ARCHIVEBOX_USER: int = ARCHIVEBOX_USER + ARCHIVEBOX_GROUP: int = ARCHIVEBOX_GROUP + RUNNING_AS_UID: int = RUNNING_AS_UID + RUNNING_AS_GID: int = RUNNING_AS_GID + DEFAULT_PUID: int = DEFAULT_PUID + DEFAULT_PGID: int = DEFAULT_PGID + IS_INSIDE_VENV: bool = sys.prefix != sys.base_prefix + + # Source code dirs + PACKAGE_DIR_NAME: str = PACKAGE_DIR.name + TEMPLATES_DIR_NAME: str = "templates" + TEMPLATES_DIR: Path = PACKAGE_DIR / TEMPLATES_DIR_NAME + STATIC_DIR_NAME: str = "static" + STATIC_DIR: Path = TEMPLATES_DIR / STATIC_DIR_NAME + + # Data dirs + ARCHIVE_DIR_NAME: str = "archive" + SOURCES_DIR_NAME: str = "sources" + PERSONAS_DIR_NAME: str = "personas" + CACHE_DIR_NAME: str = "cache" + LOGS_DIR_NAME: str = "logs" + CUSTOM_PLUGINS_DIR_NAME: str = "custom_plugins" + CUSTOM_TEMPLATES_DIR_NAME: str = "custom_templates" + ARCHIVE_DIR: Path = DATA_DIR / ARCHIVE_DIR_NAME + SOURCES_DIR: Path = DATA_DIR / SOURCES_DIR_NAME + PERSONAS_DIR: Path = DATA_DIR / PERSONAS_DIR_NAME + LOGS_DIR: Path = DATA_DIR / 
LOGS_DIR_NAME + CACHE_DIR: Path = DATA_DIR / CACHE_DIR_NAME + CUSTOM_TEMPLATES_DIR: Path = DATA_DIR / CUSTOM_TEMPLATES_DIR_NAME + USER_PLUGINS_DIR: Path = DATA_DIR / CUSTOM_PLUGINS_DIR_NAME + + # Data dir files + CONFIG_FILENAME: str = "ArchiveBox.conf" + SQL_INDEX_FILENAME: str = "index.sqlite3" + CONFIG_FILE: Path = DATA_DIR / CONFIG_FILENAME + DATABASE_FILE: Path = DATA_DIR / SQL_INDEX_FILENAME + + JSON_INDEX_FILENAME: str = "index.json" + JSONL_INDEX_FILENAME: str = "index.jsonl" + HTML_INDEX_FILENAME: str = "index.html" + ROBOTS_TXT_FILENAME: str = "robots.txt" + FAVICON_FILENAME: str = "favicon.ico" + + # Runtime dirs + TMP_DIR_NAME: str = "tmp" + DEFAULT_TMP_DIR: Path = DATA_DIR / TMP_DIR_NAME / MACHINE_ID # ./data/tmp/abc3244323 + + LIB_DIR_NAME: str = "lib" + DEFAULT_LIB_DIR: Path = DATA_DIR / LIB_DIR_NAME / MACHINE_TYPE # ./data/lib/arm64-linux-docker + DEFAULT_LIB_BIN_DIR: Path = DEFAULT_LIB_DIR / "bin" # ./data/lib/arm64-linux-docker/bin + + # Config constants + TIMEZONE: str = "UTC" + DEFAULT_CLI_COLORS: dict[str, str] = DEFAULT_CLI_COLORS + DISABLED_CLI_COLORS: dict[str, str] = benedict({k: "" for k in DEFAULT_CLI_COLORS}) + + # Hard safety limits (seconds) + MAX_HOOK_RUNTIME_SECONDS: int = 60 * 60 * 12 # 12 hours + MAX_SNAPSHOT_RUNTIME_SECONDS: int = 60 * 60 * 12 # 12 hours + + ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE + + STATICFILE_EXTENSIONS: frozenset[str] = frozenset( + ( + # 99.999% of the time, URLs ending in these extensions are static files + # that can be downloaded as-is, not html pages that need to be rendered + "gif", + "jpeg", + "jpg", + "png", + "tif", + "tiff", + "wbmp", + "ico", + "jng", + "bmp", + "svg", + "svgz", + "webp", + "ps", + "eps", + "ai", + "mp3", + "mp4", + "m4a", + "mpeg", + "mpg", + "mkv", + "mov", + "webm", + "m4v", + "flv", + "wmv", + "avi", + "ogg", + "ts", + "m3u8", + "pdf", + "txt", + "rtf", + "rtfd", + "doc", + "docx", + "ppt", + "pptx", + "xls", + "xlsx", + "atom", + "rss", + 
"css", + "js", + "json", + "dmg", + "iso", + "img", + "rar", + "war", + "hqx", + "zip", + "gz", + "bz2", + "7z", + # Less common extensions to consider adding later + # jar, swf, bin, com, exe, dll, deb + # ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm, + # pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf, + # ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml + # These are always treated as pages, not as static files, never add them: + # html, htm, shtml, xhtml, xml, aspx, php, cgi + ), + ) + + PIP_RELATED_NAMES: frozenset[str] = frozenset( + ( + ".venv", + "venv", + "virtualenv", + ".virtualenv", + ), + ) + NPM_RELATED_NAMES: frozenset[str] = frozenset( + ( + "node_modules", + "package.json", + "package-lock.json", + "yarn.lock", + ), + ) + + # When initializing archivebox in a new directory, we check to make sure the dir is + # actually empty so that we dont clobber someone's home directory or desktop by accident. + # These files are exceptions to the is_empty check when we're trying to init a new dir, + # as they could be from a previous archivebox version, system artifacts, dependencies, etc. 
+ ALLOWED_IN_DATA_DIR: frozenset[str] = frozenset( + ( + *PIP_RELATED_NAMES, + *NPM_RELATED_NAMES, + ### Dirs: + ARCHIVE_DIR_NAME, + SOURCES_DIR_NAME, + LOGS_DIR_NAME, + CACHE_DIR_NAME, + LIB_DIR_NAME, + TMP_DIR_NAME, + PERSONAS_DIR_NAME, + CUSTOM_TEMPLATES_DIR_NAME, + CUSTOM_PLUGINS_DIR_NAME, + "invalid", + "users", + "machine", + # Backwards compatibility with old directory names + "user_plugins", # old name for USER_PLUGINS_DIR (now 'plugins') + "user_templates", # old name for CUSTOM_TEMPLATES_DIR (now 'templates') + "static", # created by old static exports None: + from rich.panel import Panel + + global DJANGO_SET_UP + + if DJANGO_SET_UP: + # raise Exception('django is already set up!') + # TODO: figure out why CLI entrypoints with init_pending are running this twice sometimes + return + + from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, SudoPermission + + # if running as root, chown the data dir to the archivebox user to make sure it's accessible to the archivebox user + if IS_ROOT and ARCHIVEBOX_USER != 0: + with SudoPermission(uid=0): + # running as root is a special case where it's ok to be a bit slower + # make sure data dir is always owned by the correct user + os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}" 2>/dev/null') + os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}"/* 2>/dev/null') + + # Suppress the "database access during app initialization" warning + # This warning can be triggered during django.setup() but is safe to ignore + # since we're doing intentional setup operations + import warnings + + warnings.filterwarnings( + "ignore", + message=".*Accessing the database during app initialization.*", + category=RuntimeWarning, + ) + + try: + from django.core.management import call_command + + if in_memory_db: + raise Exception("dont use this anymore") + + # some commands dont store a long-lived sqlite3 db file on disk. 
+ # in those cases we create a temporary in-memory db and run the migrations + # immediately to get a usable in-memory-database at startup + os.environ.setdefault("ARCHIVEBOX_DATABASE_NAME", ":memory:") + django.setup() + + call_command("migrate", interactive=False, verbosity=0) + else: + # Otherwise use default sqlite3 file-based database and initialize django + # without running migrations automatically (user runs them manually by calling init) + try: + django.setup() + except Exception as e: + is_using_meta_cmd = any(ignored_subcommand in sys.argv for ignored_subcommand in ("help", "version", "--help", "--version")) + if not is_using_meta_cmd: + # show error message to user only if they're not running a meta command / just trying to get help + STDERR.print() + STDERR.print( + Panel( + f"\n[red]{e.__class__.__name__}[/red]: [yellow]{e}[/yellow]\nPlease check your config and [blue]DATA_DIR[/blue] permissions.\n", + title="\n\n[red][X] Error while trying to load database![/red]", + subtitle="[grey53]NO WRITES CAN BE PERFORMED[/grey53]", + expand=False, + style="bold red", + ), + ) + STDERR.print() + import traceback + + traceback.print_exc() + return + + from django.conf import settings + from archivebox.core.settings_logging import ERROR_LOG as DEFAULT_ERROR_LOG + + # log startup message to the error log + error_log = getattr(settings, "ERROR_LOG", DEFAULT_ERROR_LOG) + with open(error_log, "a", encoding="utf-8") as f: + command = " ".join(sys.argv) + ts = datetime.now(timezone.utc).strftime("%Y-%m-%d__%H:%M:%S") + f.write(f"\n> {command}; TS={ts} VERSION={CONSTANTS.VERSION} IN_DOCKER={SHELL_CONFIG.IN_DOCKER} IS_TTY={SHELL_CONFIG.IS_TTY}\n") + + if check_db: + # make sure the data dir is owned by a non-root user + if CONSTANTS.DATA_DIR.stat().st_uid == 0: + STDERR.print("[red][X] Error: ArchiveBox DATA_DIR cannot be owned by root![/red]") + STDERR.print(f" {CONSTANTS.DATA_DIR}") + STDERR.print() + STDERR.print("[violet]Hint:[/violet] Are you running archivebox in 
class LDAPConfig(BaseConfigSet):
    """
    LDAP authentication configuration.

    Holds the connection, bind and attribute-mapping options used for LDAP login.
    NOTE(review): earlier documentation said this section only loads and validates
    when django-auth-ldap is installed; nothing in this class checks for that
    package, so confirm where that gating happens.
    """

    toml_section_header: str = "LDAP_CONFIG"

    # Master switch: when False, validate_ldap_config() accepts any combination of values.
    LDAP_ENABLED: bool = Field(default=False)

    # Connection / bind options (all four are required once LDAP_ENABLED is True).
    LDAP_SERVER_URI: str | None = Field(default=None)
    LDAP_BIND_DN: str | None = Field(default=None)
    LDAP_BIND_PASSWORD: str | None = Field(default=None)
    LDAP_USER_BASE: str | None = Field(default=None)

    # User lookup filter and attribute mapping.
    LDAP_USER_FILTER: str = Field(default="(uid=%(user)s)")
    LDAP_USERNAME_ATTR: str = Field(default="username")
    LDAP_FIRSTNAME_ATTR: str = Field(default="givenName")
    LDAP_LASTNAME_ATTR: str = Field(default="sn")
    LDAP_EMAIL_ATTR: str = Field(default="mail")
    LDAP_CREATE_SUPERUSER: bool = Field(default=False)

    def validate_ldap_config(self) -> tuple[bool, str]:
        """
        Check that every required LDAP option has a truthy value.

        Returns:
            ``(True, "")`` when LDAP is disabled or fully configured, otherwise
            ``(False, message)`` where the message names the missing options.
        """
        if self.LDAP_ENABLED:
            missing = [
                option
                for option in ("LDAP_SERVER_URI", "LDAP_BIND_DN", "LDAP_BIND_PASSWORD", "LDAP_USER_BASE")
                if not getattr(self, option)
            ]
            if missing:
                return False, f"LDAP_* config options must all be set if LDAP_ENABLED=True\nMissing: {', '.join(missing)}"

        return True, ""
def _get_collection_id(DATA_DIR=DATA_DIR, force_create=False) -> str:
    """Return the short ID of the collection in ``DATA_DIR``, generating one if needed.

    The ID is read from ``DATA_DIR/.archivebox_id`` when that file holds a non-empty
    value. Otherwise it is derived from the machine ID, the resolved collection path
    and the directory's ctime (first 8 hex chars of a SHA-256 digest).

    Args:
        DATA_DIR: collection directory to identify (defaults to the module-level DATA_DIR).
        force_create: persist the generated ID even if the directory does not yet
            look like an active collection.

    Returns:
        The 8-character collection ID.
    """
    import shlex  # local import: only needed for the root-only ownership fixup below

    collection_id_file = DATA_DIR / ".archivebox_id"

    try:
        existing_id = collection_id_file.read_text().strip()
    except OSError:
        # missing / unreadable ID file: fall through and compute a new ID
        existing_id = ""
    if existing_id:
        # an empty or whitespace-only file is treated as "no ID yet" rather than returning ""
        return existing_id

    # hash the machine_id + collection dir path + creation time to get a unique collection_id
    machine_id = get_machine_id()
    collection_path = DATA_DIR.resolve()
    try:
        creation_date = DATA_DIR.stat().st_ctime
    except OSError:
        creation_date = datetime.now().isoformat()
    collection_id = hashlib.sha256(f"{machine_id}:{collection_path}@{creation_date}".encode()).hexdigest()[:8]

    try:
        # only persist collection_id file if we already have an index.sqlite3 file present
        # otherwise we might be running in a directory that is not a collection, no point creating cruft files.
        # The check is made against the DATA_DIR argument (not the module-level paths) so that
        # it describes the same directory the ID file is written into.
        collection_is_active = (
            os.path.isfile(DATA_DIR / DATABASE_FILE.name)
            and os.path.isdir(DATA_DIR / ARCHIVE_DIR.name)
            and os.access(DATA_DIR, os.W_OK)
        )
        if collection_is_active or force_create:
            collection_id_file.write_text(collection_id)

            # if we're running as root right now, make sure the collection_id file is owned by the archivebox user
            if IS_ROOT:
                # quote the path: it is derived from the working dir / env and is run through a root shell
                quoted_id_file = shlex.quote(str(collection_id_file))
                with SudoPermission(uid=0):
                    if ARCHIVEBOX_USER == 0:
                        os.system(f"chmod 777 {quoted_id_file}")
                    else:
                        os.system(f"chown {ARCHIVEBOX_USER} {quoted_id_file}")
    except OSError:
        # persisting the ID is best-effort; the computed ID is still returned
        pass
    return collection_id
def dir_is_writable(dir_path: Path, uid: int | None = None, gid: int | None = None, fallback=True, chown=True) -> bool:
    """Check if a given directory is writable by a specific user and group.

    Writes and removes a ``.permissions_test`` file inside ``dir_path`` while running
    under ``SudoPermission(uid=uid)``.

    Args:
        dir_path: directory to probe.
        uid: user ID to test as; ``None`` means the current effective UID.
        gid: group ID used in the chown repair attempt; ``None`` means the current effective GID.
        fallback: passed through to ``SudoPermission`` (try as the current user if
            unable to check with the provided uid).
        chown: if the first probe fails, try ``chown uid:gid dir_path`` once and probe again.

    Returns:
        True if the test file could be written and removed, otherwise False.
    """
    import shlex  # local import: only needed to quote the path for the chown repair

    # compare against None explicitly: 0 (root) is a valid uid/gid, not "unset"
    if uid is None:
        uid = os.geteuid()
    if gid is None:
        gid = os.getegid()

    test_file = dir_path / ".permissions_test"
    try:
        with SudoPermission(uid=uid, fallback=fallback):
            test_file.exists()  # result unused; presumably kept as an early stat() probe of the directory
            test_file.write_text(f"Checking if PUID={uid} PGID={gid} can write to dir")
            test_file.unlink()
            return True
    except OSError:  # PermissionError is a subclass of OSError
        if chown:
            # try fixing it using sudo permissions (path is quoted so it cannot be reinterpreted by the shell)
            with SudoPermission(uid=uid, fallback=fallback):
                os.system(f"chown {uid}:{gid} {shlex.quote(str(dir_path))} 2>/dev/null")
            # re-probe exactly once; chown=False prevents unbounded recursion
            return dir_is_writable(dir_path, uid=uid, gid=gid, fallback=fallback, chown=False)
    return False
def create_and_chown_dir(dir_path: Path) -> None:
    """Create ``dir_path`` (with parents) and hand it, and its direct children, to ARCHIVEBOX_USER.

    Runs under ``SudoPermission(uid=0, fallback=True)``. Both chown calls are
    best-effort: their stderr is discarded and their exit status is ignored. The
    chown of the children is started in the background and not waited for.

    Args:
        dir_path: directory to create and re-own.
    """
    import shlex  # local import: only needed to quote the path for the shell commands below

    with SudoPermission(uid=0, fallback=True):
        dir_path.mkdir(parents=True, exist_ok=True)
        # quote the path so spaces, quotes or shell metacharacters in a configured
        # directory cannot be interpreted by the (possibly root) shell
        quoted_dir = shlex.quote(str(dir_path))
        os.system(f"chown {ARCHIVEBOX_USER} {quoted_dir} 2>/dev/null")
        # the /* glob stays outside the quotes so the shell still expands it
        os.system(f"chown {ARCHIVEBOX_USER} {quoted_dir}/* 2>/dev/null &")
@cache
def get_or_create_working_lib_dir(autofix=True, quiet=False):
    """Return the first candidate LIB_DIR that passes ``check_lib_dir``.

    Candidates are tried in order of preference: the configured LIB_DIR, the default
    LIB_DIR, then per-collection directories under system and user share paths.
    Each one is created/chowned on a best-effort basis before being checked.

    Args:
        autofix: when the chosen directory differs from ``STORAGE_CONFIG.LIB_DIR``,
            update the config in place to point at it.
        quiet: when no candidate works, return ``None`` instead of raising.

    Returns:
        The working directory, or ``None`` if none was found and ``quiet`` is true.

    Raises:
        OSError: no candidate passed the check and ``quiet`` is false.
    """
    from archivebox import CONSTANTS
    from archivebox.config.common import STORAGE_CONFIG
    from archivebox.misc.checks import check_lib_dir

    # try a few potential directories in order of preference
    CANDIDATES = [
        STORAGE_CONFIG.LIB_DIR,
        CONSTANTS.DEFAULT_LIB_DIR,  # ./data/lib/arm64-linux-docker
    ]
    CANDIDATES.append(Path("/usr/local/share/archivebox") / get_collection_id())  # /usr/local/share/archivebox/abc5
    if os.path.isfile("/opt/homebrew/bin/archivebox"):
        # only offered when a homebrew archivebox binary exists
        CANDIDATES.append(Path("/opt/homebrew/share/archivebox") / get_collection_id())  # /opt/homebrew/share/archivebox/abc5
    CANDIDATES.append(Path("~/.local/share/archivebox").expanduser() / get_collection_id())  # ~/.local/share/archivebox/abc5

    for lib_dir in CANDIDATES:
        try:
            create_and_chown_dir(lib_dir)
        except Exception:
            # creation is best-effort; the check below decides whether the dir is usable
            pass

        if not check_lib_dir(lib_dir, throw=False, quiet=True, must_exist=True):
            continue

        if autofix and STORAGE_CONFIG.LIB_DIR != lib_dir:
            STORAGE_CONFIG.update_in_place(LIB_DIR=lib_dir)
        return lib_dir

    if quiet:
        return None
    raise OSError(f"ArchiveBox is unable to find a writable LIB_DIR, tried {CANDIDATES}!")
@cache
def get_code_locations():
    """Describe the code/library directories ArchiveBox uses.

    Returns:
        A ``benedict`` mapping each location name to a dict with:
        ``path`` (resolved path), ``enabled`` (whether the location is in use) and
        ``is_valid`` (whether it passes the access checks noted inline).
    """
    from archivebox.config import CONSTANTS
    from archivebox.config.common import STORAGE_CONFIG

    try:
        lib_dir = get_or_create_working_lib_dir(autofix=True, quiet=True) or STORAGE_CONFIG.LIB_DIR
    except Exception:
        # fall back to the configured value if no working lib dir could be determined
        lib_dir = STORAGE_CONFIG.LIB_DIR

    lib_bin_dir = lib_dir / "bin"

    return benedict(
        {
            "PACKAGE_DIR": {
                "path": (PACKAGE_DIR).resolve(),
                "enabled": True,
                "is_valid": os.access(PACKAGE_DIR / "__main__.py", os.X_OK),  # executable
            },
            "TEMPLATES_DIR": {
                "path": CONSTANTS.TEMPLATES_DIR.resolve(),
                "enabled": True,
                # validate the directory actually reported in "path" (previously only
                # STATIC_DIR was checked); the STATIC_DIR check is kept as well
                "is_valid": os.access(CONSTANTS.TEMPLATES_DIR, os.R_OK)
                and os.access(CONSTANTS.TEMPLATES_DIR, os.X_OK)
                and os.access(CONSTANTS.STATIC_DIR, os.R_OK)
                and os.access(CONSTANTS.STATIC_DIR, os.X_OK),  # read + list
            },
            "CUSTOM_TEMPLATES_DIR": {
                "path": STORAGE_CONFIG.CUSTOM_TEMPLATES_DIR.resolve(),
                "enabled": os.path.isdir(STORAGE_CONFIG.CUSTOM_TEMPLATES_DIR),
                "is_valid": os.path.isdir(STORAGE_CONFIG.CUSTOM_TEMPLATES_DIR)
                and os.access(STORAGE_CONFIG.CUSTOM_TEMPLATES_DIR, os.R_OK),  # read
            },
            "USER_PLUGINS_DIR": {
                "path": CONSTANTS.USER_PLUGINS_DIR.resolve(),
                "enabled": os.path.isdir(CONSTANTS.USER_PLUGINS_DIR),
                "is_valid": os.path.isdir(CONSTANTS.USER_PLUGINS_DIR) and os.access(CONSTANTS.USER_PLUGINS_DIR, os.R_OK),  # read
            },
            "LIB_DIR": {
                "path": lib_dir.resolve(),
                "enabled": True,
                "is_valid": os.path.isdir(lib_dir) and os.access(lib_dir, os.R_OK) and os.access(lib_dir, os.W_OK),  # read + write
            },
            "LIB_BIN_DIR": {
                "path": lib_bin_dir.resolve(),
                "enabled": True,
                "is_valid": os.path.isdir(lib_bin_dir)
                and os.access(lib_bin_dir, os.R_OK)
                and os.access(lib_bin_dir, os.W_OK),  # read + write
            },
        },
    )
(caches shared libs used by archivebox for performance)', file=sys.stderr) +# os.system(f'chmod -R 777 "{lib_dir}"') +# else: +# os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{lib_dir}"') +# else: +# raise PermissionError() +# except (PermissionError, AssertionError): +# # raise PermissionError(f'SYSTEM_LIB_DIR {lib_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}') +# print(f'[red]:cross_mark: ERROR: SYSTEM_LIB_DIR {lib_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/red]', file=sys.stderr) + +# return lib_dir + +# @cache +# def get_TMP_DIR(): +# """ +# - must NOT be inside DATA_DIR / inside a docker volume bind mount +# - must NOT have a long PATH (UNIX socket path length restrictions) +# - must NOT be shared with other collections/hosts +# - must be writable by archivebox user & root +# - must be cleared on every boot / not persisted +# - must be cleared on every archivebox version upgrade +# """ +# from .version import detect_installed_version + +# HOST_DIRS = PlatformDirs(appname='archivebox', appauthor='ArchiveBox', version=detect_installed_version(), opinion=True, ensure_exists=False) + +# # print('DATA_DIR OWNED BY:', ARCHIVEBOX_USER, ARCHIVEBOX_GROUP) +# # print('RUNNING AS:', self.PUID, self.PGID) +# run_dir = tempfile.gettempdir() +# try: +# if 'SYSTEM_TMP_DIR' in os.environ: +# run_dir = Path(os.environ['SYSTEM_TMP_DIR']).resolve() / get_collection_id(DATA_DIR=DATA_DIR) +# with SudoPermission(uid=0, fallback=True): +# run_dir.mkdir(parents=True, exist_ok=True) +# if not dir_is_writable(run_dir, uid=ARCHIVEBOX_USER): +# if IS_ROOT: +# with SudoPermission(uid=0, fallback=False): +# if ARCHIVEBOX_USER == 0: +# # print(f'[yellow]:warning: Warning: Creating SYSTEM_TMP_DIR {run_dir} with mode 777 so that non-root archivebox users can access it.[/yellow]', file=sys.stderr) +# os.system(f'chmod -R 777 "{run_dir}"') +# else: +# os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{run_dir}"') 
+# else: +# raise PermissionError() +# assert len(str(run_dir / 'supervisord.conf')) < 95, 'SYSTEM_TMP_DIR path is too long, please set SYSTEM_TMP_DIR env variable to a shorter path (unfortunately unix requires socket paths be < 108 chars)' +# return run_dir + +# run_dir = (HOST_DIRS.site_runtime_path / get_collection_id(DATA_DIR=DATA_DIR)).resolve() +# try: +# assert len(str(run_dir)) + len('/supervisord.sock') < 95 +# except AssertionError: +# run_dir = Path(tempfile.gettempdir()).resolve() / 'archivebox' / get_collection_id(DATA_DIR=DATA_DIR) +# assert len(str(run_dir)) + len('/supervisord.sock') < 95, 'SYSTEM_TMP_DIR path is too long, please set SYSTEM_TMP_DIR env variable to a shorter path (unfortunately unix requires socket paths be < 108 chars)' + +# with SudoPermission(uid=0, fallback=True): +# run_dir.mkdir(parents=True, exist_ok=True) + +# if IS_ROOT or not dir_is_writable(run_dir, uid=ARCHIVEBOX_USER): +# if IS_ROOT: +# with SudoPermission(uid=0): +# if ARCHIVEBOX_USER == 0: +# # print(f'[yellow]:warning: Warning: Creating SYSTEM_TMP_DIR {run_dir} with mode 777 so that non-root archivebox users can access it.[/yellow]', file=sys.stderr) +# os.system(f'chmod -R 777 "{run_dir}"') +# else: +# os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{run_dir}"') +# else: +# raise PermissionError() + +# except (PermissionError, AssertionError): +# # raise PermissionError(f'SYSTEM_TMP_DIR {run_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}') +# print(f'[red]:cross_mark: ERROR: SYSTEM_TMP_DIR {run_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/red]', file=sys.stderr) + +# return run_dir diff --git a/archivebox/config/permissions.py b/archivebox/config/permissions.py new file mode 100644 index 0000000000..45afb3ca26 --- /dev/null +++ b/archivebox/config/permissions.py @@ -0,0 +1,143 @@ +__package__ = "archivebox.config" + +import os +import pwd +import sys +import socket +import platform +from typing 
import cast + +from rich import print + +from pathlib import Path +from contextlib import contextmanager + +############################################################################################# + +DATA_DIR = Path(os.getcwd()) + +try: + DATA_DIR_STAT = DATA_DIR.stat() + DATA_DIR_UID = DATA_DIR_STAT.st_uid + DATA_DIR_GID = DATA_DIR_STAT.st_gid +except PermissionError: + DATA_DIR_UID = 0 + DATA_DIR_GID = 0 + +DEFAULT_PUID = 911 +DEFAULT_PGID = 911 +RUNNING_AS_UID = os.getuid() +RUNNING_AS_GID = os.getgid() +EUID = os.geteuid() +EGID = os.getegid() +SUDO_UID = int(os.environ.get("SUDO_UID", 0)) +SUDO_GID = int(os.environ.get("SUDO_GID", 0)) +USER: str = Path("~").expanduser().resolve().name +HOSTNAME: str = cast(str, max([socket.gethostname(), platform.node()], key=len)) + +IS_ROOT = RUNNING_AS_UID == 0 +IN_DOCKER = os.environ.get("IN_DOCKER", False) in ("1", "true", "True", "TRUE", "yes") +# IN_DOCKER_COMPOSE = # TODO: figure out a way to detect if running in docker compose + + +FALLBACK_UID = RUNNING_AS_UID or SUDO_UID +FALLBACK_GID = RUNNING_AS_GID or SUDO_GID +if RUNNING_AS_UID == 0: + try: + # if we are running as root it's really hard to figure out what the correct archivebox user should be + # as a last resort instead of setting DATA_DIR ownership to 0:0 (which breaks it for non-root users) + # check if 911:911 archivebox user exists on host system, and use it instead of 0 + if pwd.getpwuid(DEFAULT_PUID).pw_name == "archivebox": + FALLBACK_UID = DEFAULT_PUID + FALLBACK_GID = DEFAULT_PGID + except Exception: + pass + + +os.environ.setdefault("PUID", str(DATA_DIR_UID or EUID or RUNNING_AS_UID or FALLBACK_UID)) +os.environ.setdefault("PGID", str(DATA_DIR_GID or EGID or RUNNING_AS_GID or FALLBACK_GID)) + +ARCHIVEBOX_USER = int(os.environ["PUID"]) +ARCHIVEBOX_GROUP = int(os.environ["PGID"]) +if not USER: + try: + # alternative method 1 to get username + USER = pwd.getpwuid(ARCHIVEBOX_USER).pw_name + except Exception: + pass + +if not USER: + try: + # 
@contextmanager
def SudoPermission(uid=0, fallback=False):
    """Run the enclosed code with the effective UID switched to ``uid`` (root by default).

    If the process already has that effective UID nothing is changed. Otherwise
    the effective UID is switched before the body runs and is set to
    ARCHIVEBOX_USER once the body finishes (even if it raised).

    With ``fallback=False`` a failed switch or a failed revert raises
    PermissionError; with ``fallback=True`` both failures are ignored and the
    body simply runs with whatever privileges the process has.
    """

    # only touch the effective UID when we are not already running as the target user
    must_switch = os.geteuid() != uid

    if must_switch:
        try:
            os.seteuid(uid)
        except PermissionError as switch_err:
            if not fallback:
                raise PermissionError(f"Not enough permissions to run code as uid={uid}, please retry with sudo") from switch_err

    try:
        # hand control to the caller's `with` body
        yield
    finally:
        if must_switch:
            # hand the effective UID back to the DATA_DIR owner
            try:
                os.seteuid(ARCHIVEBOX_USER)
            except PermissionError as revert_err:
                if not fallback:
                    raise PermissionError(
                        f"Failed to revert uid={uid} back to {ARCHIVEBOX_USER} after running code with sudo"
                    ) from revert_err
+ # raise Exception('Failed to detect installed archivebox version!') + return "dev" + + +@cache +def get_COMMIT_HASH() -> str | None: + try: + git_dir = PACKAGE_DIR.parent / ".git" + ref = (git_dir / "HEAD").read_text().strip().split(" ")[-1] + commit_hash = git_dir.joinpath(ref).read_text().strip() + return commit_hash + except Exception: + pass + + try: + return list((PACKAGE_DIR.parent / ".git/refs/heads/").glob("*"))[0].read_text().strip() + except Exception: + pass + + return None + + +@cache +def get_BUILD_TIME() -> str: + if IN_DOCKER: + try: + # if we're in the archivebox official docker image, /VERSION.txt will contain the build time + docker_build_end_time = Path("/VERSION.txt").read_text().rsplit("BUILD_END_TIME=")[-1].split("\n", 1)[0] + return docker_build_end_time + except Exception: + pass + + src_last_modified_unix_timestamp = (PACKAGE_DIR / "README.md").stat().st_mtime + return datetime.fromtimestamp(src_last_modified_unix_timestamp).strftime("%Y-%m-%d %H:%M:%S %s") + + +# def get_versions_available_on_github(config): +# """ +# returns a dictionary containing the ArchiveBox GitHub release info for +# the recommended upgrade version and the currently installed version +# """ + +# # we only want to perform the (relatively expensive) check for new versions +# # when its most relevant, e.g. when the user runs a long-running command +# subcommand_run_by_user = sys.argv[3] if len(sys.argv) > 3 else 'help' +# long_running_commands = ('add', 'schedule', 'update', 'status', 'server') +# if subcommand_run_by_user not in long_running_commands: +# return None + +# github_releases_api = "https://api.github.com/repos/ArchiveBox/ArchiveBox/releases" +# response = requests.get(github_releases_api) +# if response.status_code != 200: +# stderr(f'[!] Warning: GitHub API call to check for new ArchiveBox version failed! 
(status={response.status_code})', color='lightyellow', config=config) +# return None +# all_releases = response.json() + +# installed_version = parse_version_string(config['VERSION']) + +# # find current version or nearest older version (to link to) +# current_version = None +# for idx, release in enumerate(all_releases): +# release_version = parse_version_string(release['tag_name']) +# if release_version <= installed_version: +# current_version = release +# break + +# current_version = current_version or all_releases[-1] + +# # recommended version is whatever comes after current_version in the release list +# # (perhaps too conservative to only recommend upgrading one version at a time, but it's safest) +# try: +# recommended_version = all_releases[idx+1] +# except IndexError: +# recommended_version = None + +# return {'recommended_version': recommended_version, 'current_version': current_version} + +# def can_upgrade(config): +# if config['VERSIONS_AVAILABLE'] and config['VERSIONS_AVAILABLE']['recommended_version']: +# recommended_version = parse_version_string(config['VERSIONS_AVAILABLE']['recommended_version']['tag_name']) +# current_version = parse_version_string(config['VERSIONS_AVAILABLE']['current_version']['tag_name']) +# return recommended_version > current_version +# return False + + +VERSION: str = detect_installed_version() diff --git a/archivebox/config/views.py b/archivebox/config/views.py new file mode 100644 index 0000000000..069ee6dad5 --- /dev/null +++ b/archivebox/config/views.py @@ -0,0 +1,846 @@ +__package__ = "archivebox.config" + +import html +import json +import os +import inspect +import re +from pathlib import Path +from typing import Any +from collections.abc import Callable +from urllib.parse import quote, urlencode +from django.http import HttpRequest +from django.utils import timezone +from django.utils.html import format_html +from django.utils.safestring import mark_safe + +from admin_data_views.typing import TableContext, 
def is_superuser(request: HttpRequest) -> bool:
    """Tell whether the user attached to ``request`` carries a truthy ``is_superuser`` flag.

    A user object that has no ``is_superuser`` attribute at all counts as a
    non-superuser.
    """
    superuser_flag = getattr(request.user, "is_superuser", False)
    return True if superuser_flag else False
'
+        '"
+        f"{code}"
+        "
" + ) + + +def render_highlighted_json_block(value: Any) -> str: + return render_code_block(json.dumps(value, indent=2, ensure_ascii=False), highlighted=True) + + +def get_plugin_docs_url(plugin_name: str) -> str: + return f"{ABX_PLUGINS_DOCS_BASE_URL}#{plugin_name}" + + +def get_plugin_hook_source_url(plugin_name: str, hook_name: str) -> str: + return f"{ABX_PLUGINS_GITHUB_BASE_URL}{quote(plugin_name)}/{quote(hook_name)}" + + +def get_live_config_url(key: str) -> str: + return f"{LIVE_CONFIG_BASE_URL}{quote(key)}/" + + +def get_environment_binary_url(name: str) -> str: + return f"{ENVIRONMENT_BINARIES_BASE_URL}{quote(name)}/" + + +def get_installed_binary_change_url(name: str, binary: Binary | None) -> str | None: + if binary is None or not binary.id: + return None + + base_url = getattr(binary, "admin_change_url", None) or f"{INSTALLED_BINARIES_BASE_URL}{binary.id}/change/" + changelist_filters = urlencode({"q": name}) + return f"{base_url}?{urlencode({'_changelist_filters': changelist_filters})}" + + +def get_machine_admin_url() -> str | None: + try: + from archivebox.machine.models import Machine + + return Machine.current().admin_change_url + except Exception: + return None + + +def render_code_tag_list(values: list[str]) -> str: + if not values: + return '(none)' + + tags = "".join( + str( + format_html( + '{}', + value, + ), + ) + for value in values + ) + return f'
{tags}
' + + +def render_plugin_metadata_html(config: dict[str, Any]) -> str: + required_binaries = [ + str(item.get("name")) for item in (config.get("required_binaries") or []) if isinstance(item, dict) and item.get("name") + ] + rows = ( + ("Title", config.get("title") or "(none)"), + ("Description", config.get("description") or "(none)"), + ("Required Plugins", mark_safe(render_link_tag_list(config.get("required_plugins") or [], get_plugin_docs_url))), + ("Required Binaries", mark_safe(render_link_tag_list(required_binaries, get_environment_binary_url))), + ("Output MIME Types", mark_safe(render_code_tag_list(config.get("output_mimetypes") or []))), + ) + + rendered_rows = "".join( + str( + format_html( + '
{}
{}
', + label, + value, + ), + ) + for label, value in rows + ) + return f'
{rendered_rows}
' + + +def render_link_tag_list(values: list[str], url_resolver: Callable[[str], str] | None = None) -> str: + if not values: + return '(none)' + + tags = [] + for value in values: + if url_resolver is None: + tags.append( + str( + format_html( + '{}', + value, + ), + ), + ) + else: + tags.append( + str( + format_html( + '' + '{}' + "", + url_resolver(value), + value, + ), + ), + ) + return f'
{"".join(tags)}
' + + +def render_property_links(prop_name: str, prop_info: dict[str, Any], machine_admin_url: str | None) -> str: + links = [ + str(format_html('Computed value', get_live_config_url(prop_name))), + ] + if machine_admin_url: + links.append(str(format_html('Edit override', machine_admin_url))) + + fallback = prop_info.get("x-fallback") + if isinstance(fallback, str) and fallback: + links.append(str(format_html('Fallback: {}', get_live_config_url(fallback), fallback))) + + aliases = prop_info.get("x-aliases") or [] + if isinstance(aliases, list): + for alias in aliases: + if isinstance(alias, str) and alias: + links.append(str(format_html('Alias: {}', get_live_config_url(alias), alias))) + + default = prop_info.get("default") + if prop_name.endswith("_BINARY") and isinstance(default, str) and default: + links.append(str(format_html('Binary: {}', get_environment_binary_url(default), default))) + + return "   ".join(links) + + +def render_config_properties_html(properties: dict[str, Any], machine_admin_url: str | None) -> str: + header_links = [ + str(format_html('Dependencies', ENVIRONMENT_BINARIES_BASE_URL)), + str(format_html('Installed Binaries', INSTALLED_BINARIES_BASE_URL)), + ] + if machine_admin_url: + header_links.insert(0, str(format_html('Machine Config Editor', machine_admin_url))) + + cards = [ + f'
{"   |   ".join(header_links)}
', + ] + + for prop_name, prop_info in properties.items(): + prop_type = prop_info.get("type", "unknown") + if isinstance(prop_type, list): + prop_type = " | ".join(str(type_name) for type_name in prop_type) + prop_desc = prop_info.get("description", "") + + default_html = "" + if "default" in prop_info: + default_html = str( + format_html( + '
Default: {}
', + prop_info["default"], + ), + ) + + description_html = prop_desc or mark_safe('(no description)') + cards.append( + str( + format_html( + '
' + '
' + '{}' + ' ({})' + "
" + '
{}
' + '
{}
' + "{}" + "
", + get_live_config_url(prop_name), + prop_name, + prop_type, + description_html, + mark_safe(render_property_links(prop_name, prop_info, machine_admin_url)), + mark_safe(default_html), + ), + ), + ) + + return "".join(cards) + + +def render_hook_links_html(plugin_name: str, hooks: list[str], source: str) -> str: + if not hooks: + return '(none)' + + items = [] + for hook_name in hooks: + if source == "builtin": + items.append( + str( + format_html( + '', + get_plugin_hook_source_url(plugin_name, hook_name), + hook_name, + ), + ), + ) + else: + items.append( + str( + format_html( + '
{}
', + hook_name, + ), + ), + ) + return "".join(items) + + +def render_binary_detail_description(name: str, merged: dict[str, Any], db_binary: Any) -> str: + installed_binary_url = get_installed_binary_change_url(name, db_binary) + + if installed_binary_url: + return str( + format_html( + '{}
def get_filesystem_plugins() -> dict[str, dict[str, Any]]:
    """Discover plugins from the builtin and user plugin directories.

    Every sub-directory whose name does not start with "_" is treated as a
    plugin. For each one the result holds, keyed by ``"<source>.<dirname>"``:

    - ``id``:     the same ``"<source>.<dirname>"`` key
    - ``name``:   the directory name
    - ``path``:   the directory path as a string
    - ``source``: ``"builtin"`` or ``"user"``
    - ``hooks``:  sorted file names matching ``on_*__*.sh`` / ``.py`` / ``.js``
    - ``config``: the parsed ``config.json`` object, or ``None`` when the file
      is missing, unreadable, not valid UTF-8 JSON, or not a JSON object

    Plugin directories and hook names are sorted so the listing is stable
    between calls instead of following filesystem iteration order.
    """
    from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR

    plugins: dict[str, dict[str, Any]] = {}

    for base_dir, source in ((BUILTIN_PLUGINS_DIR, "builtin"), (USER_PLUGINS_DIR, "user")):
        if not base_dir.exists():
            continue

        for plugin_dir in sorted(base_dir.iterdir()):
            if not plugin_dir.is_dir() or plugin_dir.name.startswith("_"):
                continue

            plugin_id = f"{source}.{plugin_dir.name}"

            # Find hook scripts
            hook_names = sorted(
                hook.name
                for ext in ("sh", "py", "js")
                for hook in plugin_dir.glob(f"on_*__*.{ext}")
            )

            # Load config.json if it exists. ValueError covers both
            # json.JSONDecodeError and UnicodeDecodeError; OSError covers a
            # missing or unreadable file.
            config_data = None
            try:
                with open(plugin_dir / "config.json", encoding="utf-8") as config_fd:
                    loaded_config = json.load(config_fd)
            except (OSError, ValueError):
                loaded_config = None
            # consumers call .get() on the config, so only a JSON object is usable
            if isinstance(loaded_config, dict):
                config_data = loaded_config

            plugins[plugin_id] = {
                "id": plugin_id,
                "name": plugin_dir.name,
                "path": str(plugin_dir),
                "source": source,
                "hooks": hook_names,
                "config": config_data,
            }

    return plugins
@render_with_item_view
def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
    """Build the detail page context for one binary, looked up by name.

    ``key`` is the binary name; the legacy name ``youtube-dl`` is mapped to
    ``yt-dlp`` before the lookup. When a matching record passes the validity
    check, its provider, path, version, sha256 and status are shown together
    with a rendered description. Otherwise a placeholder section is returned
    that shows whatever values the record has, or "not recorded"-style
    defaults when there is no record at all.

    Raises:
        AssertionError: if the requesting user is not a superuser.
    """
    # Explicit raise instead of an `assert` statement: asserts are removed when
    # Python runs with -O, which would silently disable this access check.
    if not is_superuser(request):
        raise AssertionError("Must be a superuser to view configuration settings.")

    key = {
        "youtube-dl": "yt-dlp",
    }.get(key, key)
    db_binary = get_db_binaries_by_name().get(key)
    # a record counts as valid via its `is_valid` attribute when it has one,
    # otherwise via a truthy `abspath`
    binary_is_valid = bool(db_binary and getattr(db_binary, "is_valid", getattr(db_binary, "abspath", None)))
    if binary_is_valid:
        binary_data = db_binary.to_json() if hasattr(db_binary, "to_json") else db_binary.__dict__
        section: SectionData = {
            "name": key,
            "description": mark_safe(render_binary_detail_description(key, binary_data, db_binary)),
            "fields": {
                "name": key,
                "binprovider": db_binary.binprovider or "-",
                "abspath": db_binary.abspath or "not found",
                "version": db_binary.version or "unknown",
                "sha256": db_binary.sha256,
                "status": db_binary.status,
            },
            "help_texts": {},
        }
        return ItemContext(
            slug=key,
            title=key,
            data=[section],
        )

    # no usable record: show what is known (if anything) with placeholder values
    section: SectionData = {
        "name": key,
        "description": "No persisted Binary record found",
        "fields": {
            "name": key,
            "binprovider": db_binary.binprovider if db_binary else "not recorded",
            "abspath": db_binary.abspath if db_binary else "not recorded",
            "version": db_binary.version if db_binary else "N/A",
            "status": db_binary.status if db_binary else "unrecorded",
        },
        "help_texts": {},
    }
    return ItemContext(
        slug=key,
        title=key,
        data=[section],
    )
+ + rows = { + "Name": [], + "Source": [], + "Path": [], + "Hooks": [], + "Config": [], + } + + plugins = get_filesystem_plugins() + + for plugin_id, plugin in plugins.items(): + rows["Name"].append(ItemLink(plugin["name"], key=plugin_id)) + rows["Source"].append(plugin["source"]) + rows["Path"].append(format_html("{}", plugin["path"])) + rows["Hooks"].append(", ".join(plugin["hooks"]) or "(none)") + + # Show config status + if plugin.get("config"): + config_properties = plugin["config"].get("properties", {}) + config_count = len(config_properties) + rows["Config"].append(f"✅ {config_count} properties" if config_count > 0 else "✅ present") + else: + rows["Config"].append("❌ none") + + if not plugins: + # Show a helpful message when no plugins found + rows["Name"].append("(no plugins found)") + rows["Source"].append("-") + rows["Path"].append(mark_safe("abx_plugins/plugins/ or data/custom_plugins/")) + rows["Hooks"].append("-") + rows["Config"].append("-") + + return TableContext( + title="Installed plugins", + table=rows, + ) + + +@render_with_item_view +def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: + assert is_superuser(request), "Must be a superuser to view configuration settings." + + plugins = get_filesystem_plugins() + + plugin = plugins.get(key) + if not plugin: + return ItemContext( + slug=key, + title=f"Plugin not found: {key}", + data=[], + ) + + # Base fields that all plugins have + docs_url = get_plugin_docs_url(plugin["name"]) + machine_admin_url = get_machine_admin_url() + fields = { + "id": plugin["id"], + "name": plugin["name"], + "source": plugin["source"], + } + + sections: list[SectionData] = [ + { + "name": plugin["name"], + "description": format_html( + '{}
ABX Plugin Docs', + plugin["path"], + docs_url, + ), + "fields": fields, + "help_texts": {}, + }, + ] + + if plugin["hooks"]: + sections.append( + { + "name": "Hooks", + "description": mark_safe(render_hook_links_html(plugin["name"], plugin["hooks"], plugin["source"])), + "fields": {}, + "help_texts": {}, + }, + ) + + if plugin.get("config"): + sections.append( + { + "name": "Plugin Metadata", + "description": mark_safe(render_plugin_metadata_html(plugin["config"])), + "fields": {}, + "help_texts": {}, + }, + ) + + sections.append( + { + "name": "config.json", + "description": mark_safe(render_highlighted_json_block(plugin["config"])), + "fields": {}, + "help_texts": {}, + }, + ) + + config_properties = plugin["config"].get("properties", {}) + if config_properties: + sections.append( + { + "name": "Config Properties", + "description": mark_safe(render_config_properties_html(config_properties, machine_admin_url)), + "fields": {}, + "help_texts": {}, + }, + ) + + return ItemContext( + slug=key, + title=plugin["name"], + data=sections, + ) + + +@render_with_table_view +def worker_list_view(request: HttpRequest, **kwargs) -> TableContext: + assert is_superuser(request), "Must be a superuser to view configuration settings." 
+ + rows = { + "Name": [], + "State": [], + "PID": [], + "Started": [], + "Command": [], + "Logfile": [], + "Exit Status": [], + } + + from archivebox.workers.supervisord_util import get_existing_supervisord_process + + supervisor = get_existing_supervisord_process() + if supervisor is None: + return TableContext( + title="No running worker processes", + table=rows, + ) + + all_config: dict[str, dict[str, object]] = {} + config_items = supervisor.getAllConfigInfo() + if not isinstance(config_items, list): + config_items = [] + for config_data in config_items: + if not isinstance(config_data, dict): + continue + config_name = config_data.get("name") + if not isinstance(config_name, str): + continue + all_config[config_name] = config_data + + # Add top row for supervisord process manager + rows["Name"].append(ItemLink("supervisord", key="supervisord")) + supervisor_state = supervisor.getState() + rows["State"].append(str(supervisor_state.get("statename") if isinstance(supervisor_state, dict) else "")) + rows["PID"].append(str(supervisor.getPID())) + rows["Started"].append("-") + rows["Command"].append("supervisord --configuration=tmp/supervisord.conf") + rows["Logfile"].append( + format_html( + '{}', + "supervisord", + "logs/supervisord.log", + ), + ) + rows["Exit Status"].append("0") + + # Add a row for each worker process managed by supervisord + process_items = supervisor.getAllProcessInfo() + if not isinstance(process_items, list): + process_items = [] + for proc_data in process_items: + if not isinstance(proc_data, dict): + continue + proc_name = str(proc_data.get("name") or "") + proc_description = str(proc_data.get("description") or "") + proc_start = proc_data.get("start") + proc_logfile = str(proc_data.get("stdout_logfile") or "") + proc_config = all_config.get(proc_name, {}) + + rows["Name"].append(ItemLink(proc_name, key=proc_name)) + rows["State"].append(str(proc_data.get("statename") or "")) + rows["PID"].append(proc_description.replace("pid ", "")) + 
@render_with_item_view
def worker_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
    """Build the detail page context for supervisord itself or one of its workers.

    ``key == "supervisord"`` shows the supervisord config file and main log;
    the start time is taken from the last "RPC interface 'supervisor'
    initialized" line in that log and left blank when no such line is present.
    Any other ``key`` is looked up as a worker process name, showing its
    config entry and stdout log.

    Returns an error context with no sections when no supervisord process is
    running.

    Raises:
        AssertionError: if the requesting user is not a superuser.
    """
    # Explicit raise instead of an `assert` statement: asserts are removed when
    # Python runs with -O, which would silently disable this access check.
    if not is_superuser(request):
        raise AssertionError("Must be a superuser to view configuration settings.")

    from archivebox.workers.supervisord_util import get_existing_supervisord_process, get_worker, get_sock_file, CONFIG_FILE_NAME

    SOCK_FILE = get_sock_file()
    CONFIG_FILE = SOCK_FILE.parent / CONFIG_FILE_NAME

    supervisor = get_existing_supervisord_process()
    if supervisor is None:
        return ItemContext(
            slug="none",
            title="error: No running supervisord process.",
            data=[],
        )

    config_items = supervisor.getAllConfigInfo()
    if not isinstance(config_items, list):
        config_items = []
    all_config: list[dict[str, object]] = [config_data for config_data in config_items if isinstance(config_data, dict)]

    if key == "supervisord":
        relevant_config = CONFIG_FILE.read_text()
        relevant_logs = str(supervisor.readLog(0, 10_000_000))

        # The startup line may be absent (e.g. truncated or rotated log), so
        # fall back to an unknown start time instead of raising IndexError.
        startup_lines = [line for line in relevant_logs.split("\n") if "RPC interface 'supervisor' initialized" in line]
        start_ts = startup_lines[-1].split(",", 1)[0] if startup_lines else None
        start_dt = parse_date(start_ts) if start_ts else None
        uptime = str(timezone.now() - start_dt).split(".")[0] if start_dt else ""
        supervisor_state = supervisor.getState()

        proc: dict[str, object] = {
            "name": "supervisord",
            "pid": supervisor.getPID(),
            "statename": str(supervisor_state.get("statename") if isinstance(supervisor_state, dict) else ""),
            "start": start_ts,
            "stop": None,
            "exitstatus": "",
            "stdout_logfile": "logs/supervisord.log",
            "description": f"pid 000, uptime {uptime}",
        }
    else:
        worker_data = get_worker(supervisor, key)
        proc = worker_data if isinstance(worker_data, dict) else {}
        relevant_config = next((config for config in all_config if config.get("name") == key), {})
        log_result = supervisor.tailProcessStdoutLog(key, 0, 10_000_000)
        relevant_logs = str(log_result[0] if isinstance(log_result, tuple) else log_result)

    section: SectionData = {
        "name": key,
        "description": key,
        "fields": {
            "Command": str(proc.get("name") or ""),
            "PID": str(proc.get("pid") or ""),
            "State": str(proc.get("statename") or ""),
            "Started": format_parsed_datetime(proc.get("start")),
            "Stopped": format_parsed_datetime(proc.get("stop")),
            "Exit Status": str(proc.get("exitstatus") or ""),
            "Logfile": str(proc.get("stdout_logfile") or ""),
            # the uptime is the text after "uptime " in the process description
            "Uptime": str(str(proc.get("description") or "").split("uptime ", 1)[-1]),
            "Config": obj_to_yaml(relevant_config) if isinstance(relevant_config, dict) else str(relevant_config),
            "Logs": relevant_logs,
        },
        "help_texts": {"Uptime": "How long the process has been running ([days:]hours:minutes:seconds)"},
    }

    return ItemContext(
        slug=key,
        title=key,
        data=[section],
    )
@render_with_item_view
def log_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
    """Build the detail page context for one ``*.log`` file in ``CONSTANTS.LOGS_DIR``.

    ``key`` is matched against the log file names: an exact file-name match
    wins; otherwise the first file (sorted by path) whose name contains
    ``key`` is used, which keeps the earlier substring-lookup behaviour. When
    nothing matches, a "not found" context with no sections is returned
    instead of raising.

    The file is decoded as UTF-8 with undecodable bytes replaced, so a log
    containing stray binary output can still be displayed.

    Raises:
        AssertionError: if the requesting user is not a superuser.
    """
    # Explicit raise instead of an `assert` statement: asserts are removed when
    # Python runs with -O, which would silently disable this access check.
    if not is_superuser(request):
        raise AssertionError("Must be a superuser to view configuration settings.")

    candidates = sorted(CONSTANTS.LOGS_DIR.glob("*.log"))
    log_file = next((logfile for logfile in candidates if logfile.name == key), None)
    if log_file is None:
        # fall back to the looser substring lookup for partial keys
        log_file = next((logfile for logfile in candidates if key in logfile.name), None)
    if log_file is None:
        return ItemContext(
            slug=key,
            title=f"Log file not found: {key}",
            data=[],
        )

    log_text = log_file.read_text(encoding="utf-8", errors="replace")
    log_stat = log_file.stat()

    section: SectionData = {
        "name": key,
        "description": key,
        "fields": {
            "Path": str(log_file),
            "Size": f"{log_stat.st_size // 1000} kb",
            "Last Updated": format_parsed_datetime(log_stat.st_mtime),
            # last 20 lines, taken from at most the final 10,000 characters
            "Tail": "\n".join(log_text[-10_000:].split("\n")[-20:]),
            "Full Log": log_text,
        },
        "help_texts": {},
    }

    return ItemContext(
        slug=key,
        title=key,
        data=[section],
    )
URL_BLACKLIST: str - - SECRET_KEY: Optional[str] - BIND_ADDR: str - ALLOWED_HOSTS: str - DEBUG: bool - PUBLIC_INDEX: bool - PUBLIC_SNAPSHOTS: bool - FOOTER_INFO: str - - SAVE_TITLE: bool - SAVE_FAVICON: bool - SAVE_WGET: bool - SAVE_WGET_REQUISITES: bool - SAVE_SINGLEFILE: bool - SAVE_READABILITY: bool - SAVE_MERCURY: bool - SAVE_PDF: bool - SAVE_SCREENSHOT: bool - SAVE_DOM: bool - SAVE_WARC: bool - SAVE_GIT: bool - SAVE_MEDIA: bool - SAVE_ARCHIVE_DOT_ORG: bool - - RESOLUTION: str - GIT_DOMAINS: str - CHECK_SSL_VALIDITY: bool - CURL_USER_AGENT: str - WGET_USER_AGENT: str - CHROME_USER_AGENT: str - COOKIES_FILE: Union[str, Path, None] - CHROME_USER_DATA_DIR: Union[str, Path, None] - CHROME_HEADLESS: bool - CHROME_SANDBOX: bool - - USE_CURL: bool - USE_WGET: bool - USE_SINGLEFILE: bool - USE_READABILITY: bool - USE_MERCURY: bool - USE_GIT: bool - USE_CHROME: bool - USE_YOUTUBEDL: bool - CURL_BINARY: str - GIT_BINARY: str - WGET_BINARY: str - SINGLEFILE_BINARY: str - READABILITY_BINARY: str - MERCURY_BINARY: str - YOUTUBEDL_BINARY: str - CHROME_BINARY: Optional[str] - - YOUTUBEDL_ARGS: List[str] - WGET_ARGS: List[str] - CURL_ARGS: List[str] - GIT_ARGS: List[str] - - -ConfigDefaultValueGetter = Callable[[ConfigDict], ConfigValue] -ConfigDefaultValue = Union[ConfigValue, ConfigDefaultValueGetter] - -ConfigDefault = TypedDict('ConfigDefault', { - 'default': ConfigDefaultValue, - 'type': Optional[Type], - 'aliases': Optional[Tuple[str, ...]], -}, total=False) - -ConfigDefaultDict = Dict[str, ConfigDefault] diff --git a/archivebox/core/__init__.py b/archivebox/core/__init__.py index 3e1d607ae4..f50f21bf7e 100644 --- a/archivebox/core/__init__.py +++ b/archivebox/core/__init__.py @@ -1 +1,29 @@ -__package__ = 'archivebox.core' +__package__ = "archivebox.core" +__order__ = 100 + + +def register_admin(admin_site): + """Register the core.models views (Snapshot, ArchiveResult, Tag, etc.) 
with the admin site""" + from archivebox.core.admin import register_admin as do_register + + do_register(admin_site) + + +def get_CONFIG(): + from archivebox.config.common import ( + SHELL_CONFIG, + STORAGE_CONFIG, + GENERAL_CONFIG, + SERVER_CONFIG, + ARCHIVING_CONFIG, + SEARCH_BACKEND_CONFIG, + ) + + return { + "SHELL_CONFIG": SHELL_CONFIG, + "STORAGE_CONFIG": STORAGE_CONFIG, + "GENERAL_CONFIG": GENERAL_CONFIG, + "SERVER_CONFIG": SERVER_CONFIG, + "ARCHIVING_CONFIG": ARCHIVING_CONFIG, + "SEARCHBACKEND_CONFIG": SEARCH_BACKEND_CONFIG, + } diff --git a/tests/__init__.py b/archivebox/core/actors.py similarity index 100% rename from tests/__init__.py rename to archivebox/core/actors.py diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py index 0329d9b053..9c95418307 100644 --- a/archivebox/core/admin.py +++ b/archivebox/core/admin.py @@ -1,426 +1,17 @@ -__package__ = 'archivebox.core' +__package__ = "archivebox.core" -from io import StringIO -from pathlib import Path -from contextlib import redirect_stdout -from datetime import datetime, timezone - -from django.contrib import admin -from django.urls import path -from django.utils.html import format_html -from django.utils.safestring import mark_safe -from django.shortcuts import render, redirect from django.contrib.auth import get_user_model -from django import forms - -from ..util import htmldecode, urldecode, ansi_to_html - -from core.models import Snapshot, ArchiveResult, Tag -from core.forms import AddLinkForm - -from core.mixins import SearchResultsAdminMixin - -from index.html import snapshot_icons -from logging_util import printable_filesize -from main import add, remove -from config import OUTPUT_DIR, SNAPSHOTS_PER_PAGE -from extractors import archive_links - -# Admin URLs -# /admin/ -# /admin/login/ -# /admin/core/ -# /admin/core/snapshot/ -# /admin/core/snapshot/:uuid/ -# /admin/core/tag/ -# /admin/core/tag/:uuid/ - - -# TODO: 
https://stackoverflow.com/questions/40760880/add-custom-button-to-django-admin-panel - - -class ArchiveResultInline(admin.TabularInline): - model = ArchiveResult - -class TagInline(admin.TabularInline): - model = Snapshot.tags.through - -from django.contrib.admin.helpers import ActionForm -from django.contrib.admin.widgets import AutocompleteSelectMultiple - -class AutocompleteTags: - model = Tag - search_fields = ['name'] - -class AutocompleteTagsAdminStub: - name = 'admin' - - -class SnapshotActionForm(ActionForm): - tags = forms.ModelMultipleChoiceField( - queryset=Tag.objects.all(), - required=False, - widget=AutocompleteSelectMultiple( - AutocompleteTags(), - AutocompleteTagsAdminStub(), - ), - ) - - # TODO: allow selecting actions for specific extractors? is this useful? - # EXTRACTOR_CHOICES = [ - # (name, name.title()) - # for name, _, _ in get_default_archive_methods() - # ] - # extractor = forms.ChoiceField( - # choices=EXTRACTOR_CHOICES, - # required=False, - # widget=forms.MultileChoiceField(attrs={'class': "form-control"}) - # ) - - -class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin): - list_display = ('added', 'title_str', 'files', 'size', 'url_str') - sort_fields = ('title_str', 'url_str', 'added', 'files') - readonly_fields = ('info', 'bookmarked', 'added', 'updated') - search_fields = ('id', 'url', 'timestamp', 'title', 'tags__name') - fields = ('timestamp', 'url', 'title', 'tags', *readonly_fields) - list_filter = ('added', 'updated', 'tags', 'archiveresult__status') - ordering = ['-added'] - actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots'] - autocomplete_fields = ['tags'] - inlines = [ArchiveResultInline] - list_per_page = SNAPSHOTS_PER_PAGE - - action_form = SnapshotActionForm - - def get_urls(self): - urls = super().get_urls() - custom_urls = [ - path('grid/', self.admin_site.admin_view(self.grid_view), name='grid') - ] - return custom_urls + 
urls - - def get_queryset(self, request): - self.request = request - return super().get_queryset(request).prefetch_related('tags') - - def tag_list(self, obj): - return ', '.join(obj.tags.values_list('name', flat=True)) - - # TODO: figure out a different way to do this, you cant nest forms so this doenst work - # def action(self, obj): - # # csrfmiddlewaretoken: Wa8UcQ4fD3FJibzxqHN3IYrrjLo4VguWynmbzzcPYoebfVUnDovon7GEMYFRgsh0 - # # action: update_snapshots - # # select_across: 0 - # # _selected_action: 76d29b26-2a88-439e-877c-a7cca1b72bb3 - # return format_html( - # ''' - #
- # - # - # - # - # - # - # - # - # ''', - # csrf.get_token(self.request), - # obj.id, - # ) - - def info(self, obj): - return format_html( - ''' - UUID: {}     - Timestamp: {}     - URL Hash: {}
- Archived: {} ({} files {})     - Favicon:     - Status code: {}     - Server: {}     - Content type: {}     - Extension: {}     -

- View Snapshot index âžĄī¸     - View actions âš™ī¸ - ''', - obj.id, - obj.timestamp, - obj.url_hash, - '✅' if obj.is_archived else '❌', - obj.num_outputs, - self.size(obj), - f'/archive/{obj.timestamp}/favicon.ico', - obj.status_code or '?', - obj.headers and obj.headers.get('Server') or '?', - obj.headers and obj.headers.get('Content-Type') or '?', - obj.extension or '?', - obj.timestamp, - obj.id, - ) - - def title_str(self, obj): - canon = obj.as_link().canonical_outputs() - tags = ''.join( - format_html('{} ', tag.id, tag) - for tag in obj.tags.all() - if str(tag).strip() - ) - return format_html( - '' - '' - '' - '' - '{}' - '', - obj.archive_path, - obj.archive_path, canon['favicon_path'], - obj.archive_path, - 'fetched' if obj.latest_title or obj.title else 'pending', - urldecode(htmldecode(obj.latest_title or obj.title or ''))[:128] or 'Pending...' - ) + mark_safe(f' {tags}') - - def files(self, obj): - return snapshot_icons(obj) - - files.admin_order_field = 'updated' - files.short_description = 'Files Saved' - - def size(self, obj): - archive_size = (Path(obj.link_dir) / 'index.html').exists() and obj.archive_size - if archive_size: - size_txt = printable_filesize(archive_size) - if archive_size > 52428800: - size_txt = mark_safe(f'{size_txt}') - else: - size_txt = mark_safe('...') - return format_html( - '{}', - obj.archive_path, - size_txt, - ) - - size.admin_order_field = 'archiveresult__count' - - def url_str(self, obj): - return format_html( - '{}', - obj.url, - obj.url, - ) - - def grid_view(self, request, extra_context=None): - - # cl = self.get_changelist_instance(request) - - # Save before monkey patching to restore for changelist list view - saved_change_list_template = self.change_list_template - saved_list_per_page = self.list_per_page - saved_list_max_show_all = self.list_max_show_all - - # Monkey patch here plus core_tags.py - self.change_list_template = 'private_index_grid.html' - self.list_per_page = SNAPSHOTS_PER_PAGE - 
self.list_max_show_all = self.list_per_page - - # Call monkey patched view - rendered_response = self.changelist_view(request, extra_context=extra_context) - - # Restore values - self.change_list_template = saved_change_list_template - self.list_per_page = saved_list_per_page - self.list_max_show_all = saved_list_max_show_all - - return rendered_response - - # for debugging, uncomment this to print all requests: - # def changelist_view(self, request, extra_context=None): - # print('[*] Got request', request.method, request.POST) - # return super().changelist_view(request, extra_context=None) - - def update_snapshots(self, request, queryset): - archive_links([ - snapshot.as_link() - for snapshot in queryset - ], out_dir=OUTPUT_DIR) - update_snapshots.short_description = "Pull" - - def update_titles(self, request, queryset): - archive_links([ - snapshot.as_link() - for snapshot in queryset - ], overwrite=True, methods=('title','favicon'), out_dir=OUTPUT_DIR) - update_titles.short_description = "âŦ‡ī¸ Title" - - def resnapshot_snapshot(self, request, queryset): - for snapshot in queryset: - timestamp = datetime.now(timezone.utc).isoformat('T', 'seconds') - new_url = snapshot.url.split('#')[0] + f'#{timestamp}' - add(new_url, tag=snapshot.tags_str()) - resnapshot_snapshot.short_description = "Re-Snapshot" - - def overwrite_snapshots(self, request, queryset): - archive_links([ - snapshot.as_link() - for snapshot in queryset - ], overwrite=True, out_dir=OUTPUT_DIR) - overwrite_snapshots.short_description = "Reset" - - def delete_snapshots(self, request, queryset): - remove(snapshots=queryset, yes=True, delete=True, out_dir=OUTPUT_DIR) - - delete_snapshots.short_description = "Delete" - - def add_tags(self, request, queryset): - tags = request.POST.getlist('tags') - print('[+] Adding tags', tags, 'to Snapshots', queryset) - for obj in queryset: - obj.tags.add(*tags) - - add_tags.short_description = "+" - - def remove_tags(self, request, queryset): - tags = 
request.POST.getlist('tags') - print('[-] Removing tags', tags, 'to Snapshots', queryset) - for obj in queryset: - obj.tags.remove(*tags) - - remove_tags.short_description = "–" - - - - title_str.short_description = 'Title' - url_str.short_description = 'Original URL' - - title_str.admin_order_field = 'title' - url_str.admin_order_field = 'url' - - - -class TagAdmin(admin.ModelAdmin): - list_display = ('slug', 'name', 'num_snapshots', 'snapshots', 'id') - sort_fields = ('id', 'name', 'slug') - readonly_fields = ('id', 'num_snapshots', 'snapshots') - search_fields = ('id', 'name', 'slug') - fields = (*readonly_fields, 'name', 'slug') - actions = ['delete_selected'] - ordering = ['-id'] - - def num_snapshots(self, obj): - return format_html( - '{} total', - obj.id, - obj.snapshot_set.count(), - ) - - def snapshots(self, obj): - total_count = obj.snapshot_set.count() - return mark_safe('
'.join( - format_html( - '{} [{}] {}', - snap.updated.strftime('%Y-%m-%d %H:%M') if snap.updated else 'pending...', - snap.id, - snap.timestamp, - snap.url, - ) - for snap in obj.snapshot_set.order_by('-updated')[:10] - ) + (f'
and {total_count-10} more...' if obj.snapshot_set.count() > 10 else '')) - - -class ArchiveResultAdmin(admin.ModelAdmin): - list_display = ('id', 'start_ts', 'extractor', 'snapshot_str', 'tags_str', 'cmd_str', 'status', 'output_str') - sort_fields = ('start_ts', 'extractor', 'status') - readonly_fields = ('id', 'uuid', 'snapshot_str', 'tags_str') - search_fields = ('id', 'uuid', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp') - fields = (*readonly_fields, 'snapshot', 'extractor', 'status', 'start_ts', 'end_ts', 'output', 'pwd', 'cmd', 'cmd_version') - autocomplete_fields = ['snapshot'] - - list_filter = ('status', 'extractor', 'start_ts', 'cmd_version') - ordering = ['-start_ts'] - list_per_page = SNAPSHOTS_PER_PAGE - - def snapshot_str(self, obj): - return format_html( - '[{}]
' - '{}', - obj.snapshot.timestamp, - obj.snapshot.timestamp, - obj.snapshot.url[:128], - ) - - def tags_str(self, obj): - return obj.snapshot.tags_str() - - def cmd_str(self, obj): - return format_html( - '
{}
', - ' '.join(obj.cmd) if isinstance(obj.cmd, list) else str(obj.cmd), - ) - - def output_str(self, obj): - return format_html( - 'â†—ī¸
{}
', - obj.snapshot.timestamp, - obj.output if (obj.status == 'succeeded') and obj.extractor not in ('title', 'archive_org') else 'index.html', - obj.output, - ) - - tags_str.short_description = 'tags' - snapshot_str.short_description = 'snapshot' - -class ArchiveBoxAdmin(admin.AdminSite): - site_header = 'ArchiveBox' - index_title = 'Links' - site_title = 'Index' - - def get_urls(self): - return [ - path('core/snapshot/add/', self.add_view, name='Add'), - ] + super().get_urls() - - def add_view(self, request): - if not request.user.is_authenticated: - return redirect(f'/admin/login/?next={request.path}') - - request.current_app = self.name - context = { - **self.each_context(request), - 'title': 'Add URLs', - } - - if request.method == 'GET': - context['form'] = AddLinkForm() - elif request.method == 'POST': - form = AddLinkForm(request.POST) - if form.is_valid(): - url = form.cleaned_data["url"] - print(f'[+] Adding URL: {url}') - depth = 0 if form.cleaned_data["depth"] == "0" else 1 - input_kwargs = { - "urls": url, - "depth": depth, - "update_all": False, - "out_dir": OUTPUT_DIR, - } - add_stdout = StringIO() - with redirect_stdout(add_stdout): - add(**input_kwargs) - print(add_stdout.getvalue()) - context.update({ - "stdout": ansi_to_html(add_stdout.getvalue().strip()), - "form": AddLinkForm() - }) - else: - context["form"] = form +from archivebox.core.models import Snapshot, ArchiveResult, Tag +from archivebox.core.admin_tags import TagAdmin +from archivebox.core.admin_snapshots import SnapshotAdmin +from archivebox.core.admin_archiveresults import ArchiveResultAdmin +from archivebox.core.admin_users import CustomUserAdmin - return render(template_name='add.html', request=request, context=context) -admin.site = ArchiveBoxAdmin() -admin.site.register(get_user_model()) -admin.site.register(Snapshot, SnapshotAdmin) -admin.site.register(Tag, TagAdmin) -admin.site.register(ArchiveResult, ArchiveResultAdmin) -admin.site.disable_action('delete_selected') +def 
register_admin(admin_site): + admin_site.register(get_user_model(), CustomUserAdmin) + admin_site.register(ArchiveResult, ArchiveResultAdmin) + admin_site.register(Snapshot, SnapshotAdmin) + admin_site.register(Tag, TagAdmin) diff --git a/archivebox/core/admin_archiveresults.py b/archivebox/core/admin_archiveresults.py new file mode 100644 index 0000000000..ba8affbb62 --- /dev/null +++ b/archivebox/core/admin_archiveresults.py @@ -0,0 +1,780 @@ +__package__ = "archivebox.core" + +import html +import json +import os +import shlex +from pathlib import Path +from urllib.parse import quote +from functools import reduce +from operator import and_ + +from django.contrib import admin +from django.db.models import Min, Q, TextField +from django.db.models.functions import Cast +from django.utils.html import format_html +from django.utils.safestring import mark_safe +from django.core.exceptions import ValidationError +from django.urls import reverse, resolve +from django.utils import timezone +from django.utils.text import smart_split + +from archivebox.config import DATA_DIR +from archivebox.config.common import SERVER_CONFIG +from archivebox.misc.paginators import AcceleratedPaginator +from archivebox.base_models.admin import BaseModelAdmin +from archivebox.hooks import get_plugin_icon +from archivebox.core.host_utils import build_snapshot_url +from archivebox.core.widgets import InlineTagEditorWidget +from archivebox.core.views import LIVE_PLUGIN_BASE_URL +from archivebox.machine.env_utils import env_to_shell_exports + + +from archivebox.core.models import ArchiveResult, Snapshot + + +def _quote_shell_string(value: str) -> str: + return "'" + str(value).replace("'", "'\"'\"'") + "'" + + +def _get_replay_source_url(result: ArchiveResult) -> str: + process = getattr(result, "process", None) + return str(getattr(process, "url", None) or result.snapshot.url or "") + + +def build_abx_dl_display_command(result: ArchiveResult) -> str: + source_url = 
_get_replay_source_url(result) + plugin_name = str(result.plugin or "").strip() + if not plugin_name and not source_url: + return "abx-dl" + if not source_url: + return f"abx-dl --plugins={plugin_name}" + return f"abx-dl --plugins={plugin_name} {_quote_shell_string(source_url)}" + + +def build_abx_dl_replay_command(result: ArchiveResult) -> str: + display_command = build_abx_dl_display_command(result) + process = getattr(result, "process", None) + env_items = env_to_shell_exports(getattr(process, "env", None) or {}) + snapshot_dir = shlex.quote(str(result.snapshot_dir)) + if env_items: + return f"cd {snapshot_dir}; env {env_items} {display_command}" + return f"cd {snapshot_dir}; {display_command}" + + +def get_plugin_admin_url(plugin_name: str) -> str: + from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, iter_plugin_dirs + + plugin_dir = next((path.resolve() for path in iter_plugin_dirs() if path.name == plugin_name), None) + if plugin_dir: + builtin_root = BUILTIN_PLUGINS_DIR.resolve() + if plugin_dir.is_relative_to(builtin_root): + return f"{LIVE_PLUGIN_BASE_URL}builtin.{quote(plugin_name)}/" + + user_root = USER_PLUGINS_DIR.resolve() + if plugin_dir.is_relative_to(user_root): + return f"{LIVE_PLUGIN_BASE_URL}user.{quote(plugin_name)}/" + + return f"{LIVE_PLUGIN_BASE_URL}builtin.{quote(plugin_name)}/" + + +def render_archiveresults_list(archiveresults_qs, limit=50): + """Render a nice inline list view of archive results with status, plugin, output, and actions.""" + + result_ids = list(archiveresults_qs.order_by("plugin").values_list("pk", flat=True)[:limit]) + if not result_ids: + return mark_safe('
No Archive Results yet...
') + + results_by_id = { + result.pk: result + for result in ArchiveResult.objects.filter(pk__in=result_ids).select_related("snapshot", "process", "process__machine") + } + results = [results_by_id[result_id] for result_id in result_ids if result_id in results_by_id] + + if not results: + return mark_safe('
No Archive Results yet...
') + + # Status colors + status_colors = { + "succeeded": ("#166534", "#dcfce7"), # green + "failed": ("#991b1b", "#fee2e2"), # red + "queued": ("#6b7280", "#f3f4f6"), # gray + "started": ("#92400e", "#fef3c7"), # amber + "backoff": ("#92400e", "#fef3c7"), + "skipped": ("#475569", "#f1f5f9"), + "noresults": ("#475569", "#f1f5f9"), + } + + rows = [] + for idx, result in enumerate(results): + status = result.status or "queued" + color, bg = status_colors.get(status, ("#6b7280", "#f3f4f6")) + output_files = result.output_files or {} + if isinstance(output_files, dict): + output_file_count = len(output_files) + elif isinstance(output_files, (list, tuple, set)): + output_file_count = len(output_files) + elif isinstance(output_files, str): + try: + parsed = json.loads(output_files) + output_file_count = len(parsed) if isinstance(parsed, (dict, list, tuple, set)) else 0 + except Exception: + output_file_count = 0 + else: + output_file_count = 0 + + # Get plugin icon + icon = get_plugin_icon(result.plugin) + + # Format timestamp + end_time = result.end_ts.strftime("%Y-%m-%d %H:%M:%S") if result.end_ts else "-" + + process_display = "-" + if result.process_id and result.process: + process_display = f''' + {result.process.pid or "-"} + ''' + + machine_display = "-" + if result.process_id and result.process and result.process.machine_id: + machine_display = f''' + {result.process.machine.hostname} + ''' + + # Truncate output for display + full_output = result.output_str or "-" + output_display = full_output[:60] + if len(full_output) > 60: + output_display += "..." 
+ + display_cmd = build_abx_dl_display_command(result) + replay_cmd = build_abx_dl_replay_command(result) + cmd_str_escaped = html.escape(display_cmd) + cmd_attr = html.escape(replay_cmd, quote=True) + + # Build output link - use embed_path() which checks output_files first + embed_path = result.embed_path() if hasattr(result, "embed_path") else None + snapshot_id = str(getattr(result, "snapshot_id", "")) + if embed_path and result.status == "succeeded": + output_link = build_snapshot_url(snapshot_id, embed_path) + else: + output_link = build_snapshot_url(snapshot_id, "") + + # Get version - try cmd_version field + version = result.cmd_version if result.cmd_version else "-" + + # Unique ID for this row's expandable output + row_id = f"output_{idx}_{str(result.id)[:8]}" + + rows.append(f''' +
+ + + + + + + + + + + + + + + + ''') + + total_count = archiveresults_qs.count() + footer = "" + if total_count > limit: + footer = f""" + + + + """ + + return mark_safe(f""" +
+
-brew install archivebox
-archivebox version +brew install archivebox
+archivebox version
-archivebox init
+archivebox init
-archivebox add +archivebox add -archivebox data dir +archivebox data dir
-archivebox server +archivebox server -archivebox server add +archivebox server add -archivebox server list +archivebox server list -archivebox server detail +archivebox server detail
+ + {str(result.id)[-8:]} + + + {status} + + {icon} + + + {result.plugin} + + + + {output_display} + + + {output_file_count} + + {end_time} + + {process_display} + + {machine_display} + + {version} + + +
+
+ + Details & Output + +
+
+ ID: {str(result.id)} + Version: {version} + PWD: {result.pwd or "-"} +
+
+ Output: +
+
{full_output}
+
+ Command: +
+
+ + {cmd_str_escaped} +
+
+
+
+ Showing {limit} of {total_count} results   + View all → +
+ + + + + + + + + + + + + + + + + {"".join(rows)} + {footer} + +
DetailsStatusPluginOutputFilesCompletedProcessMachineVersionActions
+
+ """) + + +class ArchiveResultInline(admin.TabularInline): + name = "Archive Results Log" + model = ArchiveResult + parent_model = Snapshot + # fk_name = 'snapshot' + extra = 0 + sort_fields = ("end_ts", "plugin", "output_str", "status", "cmd_version") + readonly_fields = ("id", "result_id", "completed", "command", "version") + fields = ("start_ts", "end_ts", *readonly_fields, "plugin", "cmd", "cmd_version", "pwd", "status", "output_str") + # exclude = ('id',) + ordering = ("end_ts",) + show_change_link = True + # # classes = ['collapse'] + + def get_parent_object_from_request(self, request): + resolved = resolve(request.path_info) + try: + return self.parent_model.objects.get(pk=resolved.kwargs["object_id"]) + except (self.parent_model.DoesNotExist, ValidationError): + return None + + @admin.display( + description="Completed", + ordering="end_ts", + ) + def completed(self, obj): + return format_html('

{}

', obj.end_ts.strftime("%Y-%m-%d %H:%M:%S")) + + def result_id(self, obj): + return format_html( + '[{}]', + reverse("admin:core_archiveresult_change", args=(obj.id,)), + str(obj.id)[:8], + ) + + def command(self, obj): + return format_html("{}", " ".join(obj.cmd or [])) + + def version(self, obj): + return format_html("{}", obj.cmd_version or "-") + + def get_formset(self, request, obj=None, **kwargs): + formset = super().get_formset(request, obj, **kwargs) + snapshot = self.get_parent_object_from_request(request) + form_class = getattr(formset, "form", None) + base_fields = getattr(form_class, "base_fields", {}) + snapshot_output_dir = str(snapshot.output_dir) if snapshot else "" + + # import ipdb; ipdb.set_trace() + # formset.form.base_fields['id'].widget = formset.form.base_fields['id'].hidden_widget() + + # default values for new entries + base_fields["status"].initial = "succeeded" + base_fields["start_ts"].initial = timezone.now() + base_fields["end_ts"].initial = timezone.now() + base_fields["cmd_version"].initial = "-" + base_fields["pwd"].initial = snapshot_output_dir + base_fields["cmd"].initial = '["-"]' + base_fields["output_str"].initial = "Manually recorded cmd output..." 
+ + if obj is not None: + # hidden values for existing entries and new entries + base_fields["start_ts"].widget = base_fields["start_ts"].hidden_widget() + base_fields["end_ts"].widget = base_fields["end_ts"].hidden_widget() + base_fields["cmd"].widget = base_fields["cmd"].hidden_widget() + base_fields["pwd"].widget = base_fields["pwd"].hidden_widget() + base_fields["cmd_version"].widget = base_fields["cmd_version"].hidden_widget() + return formset + + def get_readonly_fields(self, request, obj=None): + if obj is not None: + return self.readonly_fields + else: + return [] + + +class ArchiveResultAdmin(BaseModelAdmin): + list_display = ( + "details_link", + "zip_link", + "created_at", + "snapshot_info", + "tags_inline", + "status_badge", + "plugin_with_icon", + "process_link", + "machine_link", + "cmd_str", + "output_str_display", + ) + list_display_links = None + sort_fields = ("id", "created_at", "plugin", "status") + readonly_fields = ( + "admin_actions", + "cmd", + "cmd_version", + "pwd", + "cmd_str", + "snapshot_info", + "tags_str", + "created_at", + "modified_at", + "output_summary", + "plugin_with_icon", + "process_link", + ) + search_fields = ( + "snapshot__id", + "snapshot__url", + "snapshot__tags__name", + "snapshot__crawl_id", + "plugin", + "hook_name", + "output_str", + "output_json", + "process__cmd", + ) + autocomplete_fields = ["snapshot"] + + fieldsets = ( + ( + "Actions", + { + "fields": ("admin_actions",), + "classes": ("card", "wide"), + }, + ), + ( + "Snapshot", + { + "fields": ("snapshot", "snapshot_info", "tags_str"), + "classes": ("card", "wide"), + }, + ), + ( + "Plugin", + { + "fields": ("plugin_with_icon", "process_link", "status"), + "classes": ("card",), + }, + ), + ( + "Timing", + { + "fields": ("start_ts", "end_ts", "created_at", "modified_at"), + "classes": ("card",), + }, + ), + ( + "Command", + { + "fields": ("cmd", "cmd_str", "cmd_version", "pwd"), + "classes": ("card",), + }, + ), + ( + "Output", + { + "fields": ("output_str", 
"output_json", "output_files", "output_size", "output_mimetypes", "output_summary"), + "classes": ("card", "wide"), + }, + ), + ) + + list_filter = ("status", "plugin", "start_ts") + ordering = ["-start_ts"] + list_per_page = SERVER_CONFIG.SNAPSHOTS_PER_PAGE + + paginator = AcceleratedPaginator + save_on_top = True + + actions = ["delete_selected"] + + class Meta: + verbose_name = "Archive Result" + verbose_name_plural = "Archive Results" + + def change_view(self, request, object_id, form_url="", extra_context=None): + self.request = request + return super().change_view(request, object_id, form_url, extra_context) + + def changelist_view(self, request, extra_context=None): + self.request = request + return super().changelist_view(request, extra_context) + + def get_queryset(self, request): + return ( + super() + .get_queryset(request) + .select_related("snapshot", "process") + .prefetch_related("snapshot__tags") + .annotate(snapshot_first_tag=Min("snapshot__tags__name")) + ) + + def get_search_results(self, request, queryset, search_term): + if not search_term: + return queryset, False + + queryset = queryset.annotate( + snapshot_id_text=Cast("snapshot__id", output_field=TextField()), + snapshot_crawl_id_text=Cast("snapshot__crawl_id", output_field=TextField()), + output_json_text=Cast("output_json", output_field=TextField()), + cmd_text=Cast("process__cmd", output_field=TextField()), + ) + + search_bits = [ + bit[1:-1] if len(bit) >= 2 and bit[0] == bit[-1] and bit[0] in {'"', "'"} else bit for bit in smart_split(search_term) + ] + search_bits = [bit.strip() for bit in search_bits if bit.strip()] + if not search_bits: + return queryset, False + + filters = [] + for bit in search_bits: + filters.append( + Q(snapshot_id_text__icontains=bit) + | Q(snapshot__url__icontains=bit) + | Q(snapshot__tags__name__icontains=bit) + | Q(snapshot_crawl_id_text__icontains=bit) + | Q(plugin__icontains=bit) + | Q(hook_name__icontains=bit) + | Q(output_str__icontains=bit) + | 
Q(output_json_text__icontains=bit) + | Q(cmd_text__icontains=bit), + ) + + return queryset.filter(reduce(and_, filters)).distinct(), True + + def get_snapshot_view_url(self, result: ArchiveResult) -> str: + return build_snapshot_url(str(result.snapshot_id), request=getattr(self, "request", None)) + + def get_output_view_url(self, result: ArchiveResult) -> str: + output_path = result.embed_path() if hasattr(result, "embed_path") else None + if not output_path: + output_path = result.plugin or "" + return build_snapshot_url(str(result.snapshot_id), output_path, request=getattr(self, "request", None)) + + def get_output_files_url(self, result: ArchiveResult) -> str: + return f"{build_snapshot_url(str(result.snapshot_id), result.plugin, request=getattr(self, 'request', None))}/?files=1" + + def get_output_zip_url(self, result: ArchiveResult) -> str: + return f"{self.get_output_files_url(result)}&download=zip" + + @admin.display(description="Details", ordering="id") + def details_link(self, result): + return format_html( + '{}', + reverse("admin:core_archiveresult_change", args=[result.id]), + str(result.id)[-8:], + ) + + @admin.display(description="Zip") + def zip_link(self, result): + return format_html( + 'âŦ‡ ZIP', + self.get_output_zip_url(result), + ) + + @admin.display( + description="Snapshot", + ordering="snapshot__url", + ) + def snapshot_info(self, result): + snapshot_id = str(result.snapshot_id) + return format_html( + '[{}]   {}   {}
', + build_snapshot_url(snapshot_id, "index.html"), + snapshot_id[:8], + result.snapshot.bookmarked_at.strftime("%Y-%m-%d %H:%M"), + result.snapshot.url[:128], + ) + + @admin.display( + description="Snapshot Tags", + ) + def tags_str(self, result): + return result.snapshot.tags_str() + + @admin.display(description="Tags", ordering="snapshot_first_tag") + def tags_inline(self, result): + widget = InlineTagEditorWidget(snapshot_id=str(result.snapshot_id), editable=False) + tags_html = widget.render( + name=f"tags_{result.snapshot_id}", + value=result.snapshot.tags.all(), + attrs={"id": f"tags_{result.snapshot_id}"}, + snapshot_id=str(result.snapshot_id), + ) + return mark_safe(f'{tags_html}') + + @admin.display(description="Status", ordering="status") + def status_badge(self, result): + status = result.status or ArchiveResult.StatusChoices.QUEUED + return format_html( + '{}', + status, + status, + result.get_status_display() or status, + ) + + @admin.display(description="Plugin", ordering="plugin") + def plugin_with_icon(self, result): + icon = get_plugin_icon(result.plugin) + return format_html( + '{} {}', + get_plugin_admin_url(result.plugin), + result.plugin, + icon, + get_plugin_admin_url(result.plugin), + result.plugin, + ) + + @admin.display(description="Process", ordering="process__pid") + def process_link(self, result): + if not result.process_id: + return "-" + process_label = result.process.pid if result.process and result.process.pid else "-" + return format_html( + '{}', + reverse("admin:machine_process_change", args=[result.process_id]), + process_label, + ) + + @admin.display(description="Machine", ordering="process__machine__hostname") + def machine_link(self, result): + if not result.process_id or not result.process or not result.process.machine_id: + return "-" + machine = result.process.machine + return format_html( + '{} {}', + reverse("admin:machine_machine_change", args=[machine.id]), + str(machine.id)[:8], + machine.hostname, + ) + + 
@admin.display(description="Command") + def cmd_str(self, result): + display_cmd = build_abx_dl_display_command(result) + replay_cmd = build_abx_dl_replay_command(result) + return format_html( + """ +
+ + + {} + +
+ """, + replay_cmd, + replay_cmd, + display_cmd, + ) + + def output_display(self, result): + # Determine output link path - use embed_path() which checks output_files + embed_path = result.embed_path() if hasattr(result, "embed_path") else None + output_path = embed_path if (result.status == "succeeded" and embed_path) else "index.html" + snapshot_id = str(result.snapshot_id) + return format_html( + 'â†—ī¸
{}
', + build_snapshot_url(snapshot_id, output_path), + result.output_str, + ) + + @admin.display(description="Output", ordering="output_str") + def output_str_display(self, result): + output_text = str(result.output_str or "").strip() + if not output_text: + return "-" + + live_path = result.embed_path() if hasattr(result, "embed_path") else None + if live_path: + return format_html( + '{}', + build_snapshot_url(str(result.snapshot_id), live_path), + output_text, + output_text, + ) + + return format_html( + '{}', + output_text, + output_text, + ) + + @admin.display(description="") + def admin_actions(self, result): + return format_html( + """ + + """, + self.get_output_view_url(result), + self.get_output_files_url(result), + self.get_output_zip_url(result), + self.get_snapshot_view_url(result), + ) + + def output_summary(self, result): + snapshot_dir = Path(DATA_DIR) / str(result.pwd).split("data/", 1)[-1] + output_html = format_html( + '
{}

', + result.output_str, + ) + snapshot_id = str(result.snapshot_id) + output_html += format_html( + 'See result files ...
',
+            build_snapshot_url(snapshot_id, "index.html"),
+        )
+        embed_path = result.embed_path() if hasattr(result, "embed_path") else ""
+        path_from_embed = snapshot_dir / (embed_path or "")
+        output_html += format_html(
+            '{}/{}

', + str(snapshot_dir), + str(embed_path), + ) + if os.access(path_from_embed, os.R_OK): + root_dir = str(path_from_embed) + else: + root_dir = str(snapshot_dir) + + # print(root_dir, str(list(os.walk(root_dir)))) + + for root, dirs, files in os.walk(root_dir): + depth = root.replace(root_dir, "").count(os.sep) + 1 + if depth > 2: + continue + indent = " " * 4 * (depth) + output_html += format_html('{}{}/
', indent, os.path.basename(root)) + indentation_str = " " * 4 * (depth + 1) + for filename in sorted(files): + is_hidden = filename.startswith(".") + output_html += format_html( + '{}{}
', + int(not is_hidden), + indentation_str, + filename.strip(), + ) + + return output_html + mark_safe("
") + + +def register_admin(admin_site): + admin_site.register(ArchiveResult, ArchiveResultAdmin) diff --git a/archivebox/core/admin_site.py b/archivebox/core/admin_site.py new file mode 100644 index 0000000000..770a1d2a09 --- /dev/null +++ b/archivebox/core/admin_site.py @@ -0,0 +1,73 @@ +__package__ = "archivebox.core" + +from typing import TYPE_CHECKING, Any + +from django.contrib import admin +from admin_data_views.admin import ( + admin_data_index_view as adv_admin_data_index_view, + get_admin_data_urls as adv_get_admin_data_urls, + get_app_list as adv_get_app_list, +) + +if TYPE_CHECKING: + from django.http import HttpRequest + from django.template.response import TemplateResponse + from django.urls import URLPattern, URLResolver + + from admin_data_views.typing import AppDict + + +class ArchiveBoxAdmin(admin.AdminSite): + site_header = "ArchiveBox" + index_title = "Admin Views" + site_title = "Admin" + namespace = "admin" + + def get_app_list(self, request: "HttpRequest", app_label: str | None = None) -> list["AppDict"]: + if app_label is None: + return adv_get_app_list(self, request) + return adv_get_app_list(self, request, app_label) + + def admin_data_index_view(self, request: "HttpRequest", **kwargs: Any) -> "TemplateResponse": + return adv_admin_data_index_view(self, request, **kwargs) + + def get_admin_data_urls(self) -> list["URLResolver | URLPattern"]: + return adv_get_admin_data_urls(self) + + def get_urls(self) -> list["URLResolver | URLPattern"]: + return self.get_admin_data_urls() + super().get_urls() + + +archivebox_admin = ArchiveBoxAdmin() +# Note: delete_selected is enabled per-model via actions = ['delete_selected'] in each ModelAdmin +# TODO: https://stackoverflow.com/questions/40760880/add-custom-button-to-django-admin-panel + + +############### Admin Data View sections are defined in settings.ADMIN_DATA_VIEWS ######### + + +def register_admin_site(): + """Replace the default admin site with our custom ArchiveBox admin site.""" + from 
django.contrib import admin + from django.contrib.admin import sites + + admin.site = archivebox_admin + sites.site = archivebox_admin + + # Register admin views for each app + # (Previously handled by ABX plugin system, now called directly) + from archivebox.core.admin import register_admin as register_core_admin + from archivebox.crawls.admin import register_admin as register_crawls_admin + from archivebox.api.admin import register_admin as register_api_admin + from archivebox.machine.admin import register_admin as register_machine_admin + from archivebox.personas.admin import register_admin as register_personas_admin + from archivebox.workers.admin import register_admin as register_workers_admin + + register_core_admin(archivebox_admin) + register_crawls_admin(archivebox_admin) + register_api_admin(archivebox_admin) + register_machine_admin(archivebox_admin) + register_personas_admin(archivebox_admin) + register_workers_admin(archivebox_admin) + + return archivebox_admin diff --git a/archivebox/core/admin_snapshots.py b/archivebox/core/admin_snapshots.py new file mode 100644 index 0000000000..266ed974e9 --- /dev/null +++ b/archivebox/core/admin_snapshots.py @@ -0,0 +1,1144 @@ +__package__ = "archivebox.core" + +import json +from functools import lru_cache + +from django.contrib import admin, messages +from django.urls import path +from django.shortcuts import get_object_or_404, redirect +from django.utils.html import format_html +from django.utils.safestring import mark_safe +from django.db.models import Q, Sum, Count, Prefetch +from django.db.models.functions import Coalesce +from django import forms +from django.template import Template, RequestContext +from django.contrib.admin.helpers import ActionForm + +from archivebox.config import DATA_DIR +from archivebox.config.common import SERVER_CONFIG +from archivebox.misc.util import htmldecode, urldecode +from archivebox.misc.paginators import AcceleratedPaginator +from archivebox.misc.logging_util import 
printable_filesize +from archivebox.search.admin import SearchResultsAdminMixin +from archivebox.core.host_utils import build_snapshot_url, build_web_url +from archivebox.hooks import get_plugin_icon, get_plugin_name, get_plugins + +from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin +from archivebox.workers.tasks import bg_archive_snapshots, bg_add + +from archivebox.core.models import Tag, Snapshot, ArchiveResult +from archivebox.core.admin_archiveresults import render_archiveresults_list +from archivebox.core.widgets import TagEditorWidget, InlineTagEditorWidget + + +# GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': [], 'CAN_UPGRADE': False} +GLOBAL_CONTEXT = {} + + +@lru_cache(maxsize=1) +def _plugin_sort_order() -> dict[str, int]: + return {get_plugin_name(plugin): idx for idx, plugin in enumerate(get_plugins())} + + +@lru_cache(maxsize=256) +def _expected_snapshot_hook_total(config_json: str) -> int: + from archivebox.hooks import discover_hooks + + try: + config = json.loads(config_json) if config_json else {} + except Exception: + return 0 + + return len(discover_hooks("Snapshot", config=config)) + + +class SnapshotActionForm(ActionForm): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # Define tags field in __init__ to avoid database access during app initialization + self.fields["tags"] = forms.CharField( + label="", + required=False, + widget=TagEditorWidget(), + ) + + def clean_tags(self): + """Parse comma-separated tag names into Tag objects.""" + tags_str = self.cleaned_data.get("tags", "") + if not tags_str: + return [] + + tag_names = [name.strip() for name in tags_str.split(",") if name.strip()] + tags = [] + for name in tag_names: + tag, _ = Tag.objects.get_or_create( + name__iexact=name, + defaults={"name": name}, + ) + # Use the existing tag if found by case-insensitive match + tag = Tag.objects.filter(name__iexact=name).first() or tag + tags.append(tag) + return tags + + # TODO: 
allow selecting actions for specific extractor plugins? is this useful? + # plugin = forms.ChoiceField( + # choices=ArchiveResult.PLUGIN_CHOICES, + # required=False, + # widget=forms.MultileChoiceField(attrs={'class': "form-control"}) + # ) + + +class TagNameListFilter(admin.SimpleListFilter): + title = "By tag name" + parameter_name = "tag" + + def lookups(self, request, model_admin): + return [(str(tag.pk), tag.name) for tag in Tag.objects.order_by("name")] + + def queryset(self, request, queryset): + if self.value(): + return queryset.filter(tags__id=self.value()) + return queryset + + +class SnapshotAdminForm(forms.ModelForm): + """Custom form for Snapshot admin with tag editor widget.""" + + tags_editor = forms.CharField( + label="Tags", + required=False, + widget=TagEditorWidget(), + help_text="Type tag names and press Enter or Space to add. Click × to remove.", + ) + + class Meta: + model = Snapshot + fields = "__all__" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # Initialize tags_editor with current tags + if self.instance and self.instance.pk: + self.initial["tags_editor"] = ",".join( + sorted(tag.name for tag in self.instance.tags.all()), + ) + + def save(self, commit=True): + instance = super().save(commit=False) + + # Handle tags_editor field + if commit: + instance.save() + save_m2m = getattr(self, "_save_m2m", None) + if callable(save_m2m): + save_m2m() + + # Parse and save tags from tags_editor + tags_str = self.cleaned_data.get("tags_editor", "") + if tags_str: + tag_names = [name.strip() for name in tags_str.split(",") if name.strip()] + tags = [] + for name in tag_names: + tag, _ = Tag.objects.get_or_create( + name__iexact=name, + defaults={"name": name}, + ) + tag = Tag.objects.filter(name__iexact=name).first() or tag + tags.append(tag) + instance.tags.set(tags) + else: + instance.tags.clear() + + return instance + + +class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): + form = 
SnapshotAdminForm + list_display = ("created_at", "preview_icon", "title_str", "tags_inline", "status_with_progress", "files", "size_with_stats") + sort_fields = ("title_str", "created_at", "status", "crawl") + readonly_fields = ( + "admin_actions", + "snapshot_summary", + "url_favicon", + "tags_badges", + "imported_timestamp", + "created_at", + "modified_at", + "downloaded_at", + "output_dir", + "archiveresults_list", + ) + search_fields = ("id", "url", "timestamp", "title", "tags__name") + list_filter = ("created_at", "downloaded_at", "archiveresult__status", "crawl__created_by", TagNameListFilter) + + fieldsets = ( + ( + "Actions", + { + "fields": ("admin_actions",), + "classes": ("card", "actions-card"), + }, + ), + ( + "Snapshot", + { + "fields": ("snapshot_summary",), + "classes": ("card",), + }, + ), + ( + "URL", + { + "fields": (("url_favicon", "url"), ("title", "tags_badges")), + "classes": ("card", "wide"), + }, + ), + ( + "Tags", + { + "fields": ("tags_editor",), + "classes": ("card",), + }, + ), + ( + "Status", + { + "fields": ("status", "retry_at"), + "classes": ("card",), + }, + ), + ( + "Timestamps", + { + "fields": ("bookmarked_at", "created_at", "modified_at", "downloaded_at"), + "classes": ("card",), + }, + ), + ( + "Relations", + { + "fields": ("crawl",), + "classes": ("card",), + }, + ), + ( + "Config", + { + "fields": ("config",), + "description": 'Uses Crawl.config by default. 
Only set per-snapshot overrides here when needed.', + "classes": ("card",), + }, + ), + ( + "Files", + { + "fields": ("output_dir",), + "classes": ("card",), + }, + ), + ( + "Archive Results", + { + "fields": ("archiveresults_list",), + "classes": ("card", "wide"), + }, + ), + ) + + ordering = ["-created_at"] + actions = ["add_tags", "remove_tags", "resnapshot_snapshot", "update_snapshots", "overwrite_snapshots", "delete_snapshots"] + inlines = [] # Removed TagInline, using TagEditorWidget instead + list_per_page = min(max(5, SERVER_CONFIG.SNAPSHOTS_PER_PAGE), 5000) + + action_form = SnapshotActionForm + paginator = AcceleratedPaginator + + save_on_top = True + show_full_result_count = False + + def changelist_view(self, request, extra_context=None): + self.request = request + extra_context = extra_context or {} + try: + return super().changelist_view(request, extra_context | GLOBAL_CONTEXT) + except Exception as e: + self.message_user(request, f"Error occurred while loading the page: {str(e)} {request.GET} {request.POST}") + return super().changelist_view(request, GLOBAL_CONTEXT) + + def get_actions(self, request): + actions = super().get_actions(request) + if not actions: + return {} + actions.pop("delete_selected", None) + return actions + + def get_snapshot_view_url(self, obj: Snapshot) -> str: + return build_snapshot_url(str(obj.id), request=getattr(self, "request", None)) + + def get_snapshot_files_url(self, obj: Snapshot) -> str: + return f"{build_snapshot_url(str(obj.id), request=getattr(self, 'request', None))}/?files=1" + + def get_snapshot_zip_url(self, obj: Snapshot) -> str: + return f"{self.get_snapshot_files_url(obj)}&download=zip" + + def get_urls(self): + urls = super().get_urls() + custom_urls = [ + path("grid/", self.admin_site.admin_view(self.grid_view), name="grid"), + path("/redo-failed/", self.admin_site.admin_view(self.redo_failed_view), name="core_snapshot_redo_failed"), + ] + return custom_urls + urls + + def redo_failed_view(self, request, 
object_id): + snapshot = get_object_or_404(Snapshot, pk=object_id) + + if request.method == "POST": + retried = snapshot.retry_failed_archiveresults() + if retried: + messages.success( + request, + f"Queued {retried} failed/skipped extractors for retry on this snapshot.", + ) + else: + messages.info( + request, + "No failed/skipped extractors were found on this snapshot.", + ) + + return redirect(snapshot.admin_change_url) + + # def get_queryset(self, request): + # # tags_qs = SnapshotTag.objects.all().select_related('tag') + # # prefetch = Prefetch('snapshottag_set', queryset=tags_qs) + + # self.request = request + # return super().get_queryset(request).prefetch_related('archiveresult_set').distinct() # .annotate(archiveresult_count=Count('archiveresult')) + def get_queryset(self, request): + self.request = request + ordering_fields = self._get_ordering_fields(request) + needs_size_sort = "size_with_stats" in ordering_fields + needs_files_sort = "files" in ordering_fields + needs_tags_sort = "tags_inline" in ordering_fields + is_change_view = getattr(getattr(request, "resolver_match", None), "url_name", "") == "core_snapshot_change" + + prefetch_qs = ArchiveResult.objects.only( + "id", + "snapshot_id", + "plugin", + "status", + "output_size", + "output_files", + "output_str", + ) + if not is_change_view: + prefetch_qs = prefetch_qs.filter(Q(status="succeeded")) + + qs = ( + super() + .get_queryset(request) + .select_related("crawl__created_by") + .defer("config", "notes") + .prefetch_related("tags") + .prefetch_related(Prefetch("archiveresult_set", queryset=prefetch_qs)) + ) + + if needs_size_sort: + qs = qs.annotate( + output_size_sum=Coalesce( + Sum("archiveresult__output_size"), + 0, + ), + ) + + if needs_files_sort: + qs = qs.annotate( + ar_succeeded_count=Count( + "archiveresult", + filter=Q(archiveresult__status="succeeded"), + ), + ) + if needs_tags_sort: + qs = qs.annotate(tag_count=Count("tags", distinct=True)) + + return qs + + 
@admin.display(description="Imported Timestamp") + def imported_timestamp(self, obj): + context = RequestContext( + self.request, + { + "bookmarked_date": obj.bookmarked_at, + "timestamp": obj.timestamp, + }, + ) + + html = Template("""{{bookmarked_date}} ({{timestamp}})""") + return mark_safe(html.render(context)) + + # pretty_time = obj.bookmarked.strftime('%Y-%m-%d %H:%M:%S') + # return f'{pretty_time} ({obj.timestamp})' + + # TODO: figure out a different way to do this, you cant nest forms so this doenst work + # def action(self, obj): + # # csrfmiddlewaretoken: Wa8UcQ4fD3FJibzxqHN3IYrrjLo4VguWynmbzzcPYoebfVUnDovon7GEMYFRgsh0 + # # action: update_snapshots + # # select_across: 0 + # # _selected_action: 76d29b26-2a88-439e-877c-a7cca1b72bb3 + # return format_html( + # ''' + #
+ # + # + # + # + # + # + # + #
+ # ''', + # csrf.get_token(self.request), + # obj.pk, + # ) + + @admin.display(description="") + def admin_actions(self, obj): + summary_url = self.get_snapshot_view_url(obj) + files_url = self.get_snapshot_files_url(obj) + zip_url = self.get_snapshot_zip_url(obj) + redo_failed_url = f"/admin/core/snapshot/{obj.pk}/redo-failed/" + return format_html( + """ + + """, + summary_url, + files_url, + zip_url, + obj.url, + obj.pk, + redo_failed_url, + obj.pk, + obj.pk, + ) + + def status_info(self, obj): + favicon_url = build_snapshot_url(str(obj.id), "favicon.ico") + return format_html( + """ + Archived: {} ({} files {})     + Favicon:     + Extension: {}     + """, + "✅" if obj.is_archived else "❌", + obj.num_outputs, + self.size(obj) or "0kb", + favicon_url, + obj.extension or "-", + ) + + @admin.display(description="Archive Results") + def archiveresults_list(self, obj): + return render_archiveresults_list(obj.archiveresult_set.all()) + + @admin.display( + description="Title", + ordering="title", + ) + def title_str(self, obj): + title_raw = (obj.title or "").strip() + url_raw = (obj.url or "").strip() + title_normalized = title_raw.lower() + url_normalized = url_raw.lower() + show_title = bool(title_raw) and title_normalized != "pending..." and title_normalized != url_normalized + css_class = "fetched" if show_title else "pending" + + detail_url = build_web_url(f"/{obj.archive_path_from_db}/index.html") + title_html = "" + if show_title: + title_html = format_html( + '{}', + detail_url, + css_class, + urldecode(htmldecode(title_raw))[:128], + ) + + return format_html( + "{}" + '
' + '{}' + "
", + title_html, + url_raw or obj.url, + (url_raw or obj.url)[:128], + ) + + @admin.display(description="Tags", ordering="tag_count") + def tags_inline(self, obj): + widget = InlineTagEditorWidget(snapshot_id=str(obj.pk)) + tags = self._get_prefetched_tags(obj) + tags_html = widget.render( + name=f"tags_{obj.pk}", + value=tags if tags is not None else obj.tags.all(), + attrs={"id": f"tags_{obj.pk}"}, + snapshot_id=str(obj.pk), + ) + return mark_safe(f'{tags_html}') + + @admin.display(description="Tags") + def tags_badges(self, obj): + widget = InlineTagEditorWidget(snapshot_id=str(obj.pk), editable=False) + tags = self._get_prefetched_tags(obj) + tags_html = widget.render( + name=f"tags_readonly_{obj.pk}", + value=tags if tags is not None else obj.tags.all(), + attrs={"id": f"tags_readonly_{obj.pk}"}, + snapshot_id=str(obj.pk), + ) + return mark_safe(f'{tags_html}') + + def _get_preview_data(self, obj): + results = self._get_prefetched_results(obj) + if results is not None: + has_screenshot = any(r.plugin == "screenshot" for r in results) + has_favicon = any(r.plugin == "favicon" for r in results) + else: + available_plugins = set(obj.archiveresult_set.filter(plugin__in=("screenshot", "favicon")).values_list("plugin", flat=True)) + has_screenshot = "screenshot" in available_plugins + has_favicon = "favicon" in available_plugins + + if not has_screenshot and not has_favicon: + return None + + if has_screenshot: + img_url = build_snapshot_url(str(obj.id), "screenshot/screenshot.png") + fallbacks = [ + build_snapshot_url(str(obj.id), "screenshot.png"), + build_snapshot_url(str(obj.id), "favicon/favicon.ico"), + build_snapshot_url(str(obj.id), "favicon.ico"), + ] + img_alt = "Screenshot" + preview_class = "screenshot" + else: + img_url = build_snapshot_url(str(obj.id), "favicon/favicon.ico") + fallbacks = [ + build_snapshot_url(str(obj.id), "favicon.ico"), + ] + img_alt = "Favicon" + preview_class = "favicon" + + fallback_list = ",".join(fallbacks) + onerror_js = ( + 
"this.dataset.fallbacks && this.dataset.fallbacks.length ? " + "(this.src=this.dataset.fallbacks.split(',').shift(), " + "this.dataset.fallbacks=this.dataset.fallbacks.split(',').slice(1).join(',')) : " + "this.remove()" + ) + + return { + "img_url": img_url, + "img_alt": img_alt, + "preview_class": preview_class, + "onerror_js": onerror_js, + "fallback_list": fallback_list, + } + + @admin.display(description="", empty_value="") + def url_favicon(self, obj): + preview = self._get_preview_data(obj) + if not preview: + return "" + + favicon_url = build_snapshot_url(str(obj.id), "favicon/favicon.ico") + fallback_list = ",".join([build_snapshot_url(str(obj.id), "favicon.ico")]) + onerror_js = ( + "this.dataset.fallbacks && this.dataset.fallbacks.length ? " + "(this.src=this.dataset.fallbacks.split(',').shift(), " + "this.dataset.fallbacks=this.dataset.fallbacks.split(',').slice(1).join(',')) : " + "this.closest('a') && this.closest('a').remove()" + ) + + return format_html( + '' + 'Favicon' + "", + favicon_url, + favicon_url, + onerror_js, + fallback_list, + ) + + @admin.display(description="Preview", empty_value="") + def preview_icon(self, obj): + preview = self._get_preview_data(obj) + if not preview: + return None + + return format_html( + '{}', + preview["img_url"], + preview["img_alt"], + preview["preview_class"], + preview["onerror_js"], + preview["fallback_list"], + ) + + @admin.display(description=" ", empty_value="") + def snapshot_summary(self, obj): + preview = self._get_preview_data(obj) + stats = self._get_progress_stats(obj) + archive_size = stats["output_size"] or 0 + size_txt = printable_filesize(archive_size) if archive_size else "pending" + screenshot_html = "" + + if preview: + screenshot_html = format_html( + '' + '{alt}' + "", + href=build_web_url(f"/{obj.archive_path}"), + src=preview["img_url"], + alt=preview["img_alt"], + onerror=preview["onerror_js"], + fallbacks=preview["fallback_list"], + ) + + return format_html( + '
' + "{}" + '
' + '
snap_dir size
' + '
{}
' + '
' + 'Open {} to inspect files.' + "
" + "
" + "
", + screenshot_html, + size_txt, + build_web_url(f"/{obj.archive_path}"), + obj.archive_path, + ) + + @admin.display( + description="Files Saved", + ordering="ar_succeeded_count", + ) + def files(self, obj): + results = self._get_prefetched_results(obj) + if results is None: + results = obj.archiveresult_set.only("plugin", "status", "output_files", "output_str") + + plugins_with_output: dict[str, ArchiveResult] = {} + for result in results: + if result.status != ArchiveResult.StatusChoices.SUCCEEDED: + continue + if not (result.output_files or str(result.output_str or "").strip()): + continue + plugins_with_output.setdefault(result.plugin, result) + + if not plugins_with_output: + return mark_safe('...') + + sorted_results = sorted( + plugins_with_output.values(), + key=lambda result: (_plugin_sort_order().get(result.plugin, 9999), result.plugin), + ) + output = [ + format_html( + '{}', + self._result_output_href(obj, result), + result.plugin, + get_plugin_icon(result.plugin), + ) + for result in sorted_results + ] + + return format_html( + '{}', + mark_safe("".join(output)), + ) + + @admin.display( + # ordering='archiveresult_count' + ) + def size(self, obj): + archive_size = self._get_progress_stats(obj)["output_size"] or 0 + if archive_size: + size_txt = printable_filesize(archive_size) + if archive_size > 52428800: + size_txt = mark_safe(f"{size_txt}") + else: + size_txt = mark_safe('...') + return format_html( + '{}', + build_web_url(f"/{obj.archive_path}"), + size_txt, + ) + + @admin.display( + description="Status", + ordering="status", + ) + def status_with_progress(self, obj): + """Show status with progress bar for in-progress snapshots.""" + stats = self._get_progress_stats(obj) + + # Status badge colors + status_colors = { + "queued": ("#f59e0b", "#fef3c7"), # amber + "started": ("#3b82f6", "#dbeafe"), # blue + "sealed": ("#10b981", "#d1fae5"), # green + "succeeded": ("#10b981", "#d1fae5"), # green + "failed": ("#ef4444", "#fee2e2"), # red + "backoff": 
("#f59e0b", "#fef3c7"), # amber + "skipped": ("#6b7280", "#f3f4f6"), # gray + } + fg_color, bg_color = status_colors.get(obj.status, ("#6b7280", "#f3f4f6")) + + # For started snapshots, show progress bar + if obj.status == "started" and stats["total"] > 0: + percent = stats["percent"] + running = stats["running"] + succeeded = stats["succeeded"] + failed = stats["failed"] + + return format_html( + """
+
+ + {}/{} hooks +
+
+
+
+
+ ✓{} ✗{} âŗ{} +
+
""", + succeeded + failed + stats["skipped"], + stats["total"], + int(succeeded / stats["total"] * 100) if stats["total"] else 0, + int(succeeded / stats["total"] * 100) if stats["total"] else 0, + int((succeeded + failed) / stats["total"] * 100) if stats["total"] else 0, + int((succeeded + failed) / stats["total"] * 100) if stats["total"] else 0, + percent, + succeeded, + failed, + running, + ) + + # For other statuses, show simple badge + return format_html( + '{}', + bg_color, + fg_color, + obj.status.upper(), + ) + + @admin.display( + description="Size", + ordering="output_size_sum", + ) + def size_with_stats(self, obj): + """Show archive size with output size from archive results.""" + stats = self._get_progress_stats(obj) + output_size = stats["output_size"] + size_bytes = output_size or 0 + zip_url = self.get_snapshot_zip_url(obj) + zip_link = format_html( + 'âŦ‡ ZIP', + zip_url, + ) + + if size_bytes: + size_txt = printable_filesize(size_bytes) + if size_bytes > 52428800: # 50MB + size_txt = mark_safe(f"{size_txt}") + else: + size_txt = mark_safe('...') + + # Show hook statistics + if stats["total"] > 0: + return format_html( + '' + "{}" + '
' + "{}/{} hooks
" + "{}", + build_web_url(f"/{obj.archive_path_from_db}"), + size_txt, + stats["succeeded"], + stats["total"], + zip_link, + ) + + return format_html( + '{}{}', + build_web_url(f"/{obj.archive_path_from_db}"), + size_txt, + zip_link, + ) + + def _get_progress_stats(self, obj): + results = self._get_prefetched_results(obj) + if results is None: + stats = obj.get_progress_stats() + expected_total = self._get_expected_hook_total(obj) + total = max(stats["total"], expected_total) + completed = stats["succeeded"] + stats["failed"] + stats.get("skipped", 0) + stats.get("noresults", 0) + stats["total"] = total + stats["pending"] = max(total - completed - stats["running"], 0) + stats["percent"] = int((completed / total * 100) if total > 0 else 0) + return stats + + expected_total = self._get_expected_hook_total(obj) + observed_total = len(results) + total = max(observed_total, expected_total) + succeeded = sum(1 for r in results if r.status == "succeeded") + failed = sum(1 for r in results if r.status == "failed") + running = sum(1 for r in results if r.status == "started") + skipped = sum(1 for r in results if r.status == "skipped") + noresults = sum(1 for r in results if r.status == "noresults") + pending = max(total - succeeded - failed - running - skipped - noresults, 0) + completed = succeeded + failed + skipped + noresults + percent = int((completed / total * 100) if total > 0 else 0) + is_sealed = obj.status not in (obj.StatusChoices.QUEUED, obj.StatusChoices.STARTED) + output_size = None + + if hasattr(obj, "output_size_sum"): + output_size = obj.output_size_sum or 0 + else: + output_size = sum(r.output_size or 0 for r in results) + + return { + "total": total, + "succeeded": succeeded, + "failed": failed, + "running": running, + "pending": pending, + "skipped": skipped, + "noresults": noresults, + "percent": percent, + "output_size": output_size or 0, + "is_sealed": is_sealed, + } + + def _get_prefetched_results(self, obj): + if hasattr(obj, 
"_prefetched_objects_cache") and "archiveresult_set" in obj._prefetched_objects_cache: + return obj.archiveresult_set.all() + return None + + def _get_expected_hook_total(self, obj) -> int: + from archivebox.config.configset import get_config + + try: + config = get_config(crawl=obj.crawl, snapshot=obj) + config_json = json.dumps(config, sort_keys=True, default=str, separators=(",", ":")) + return _expected_snapshot_hook_total(config_json) + except Exception: + return 0 + + def _get_prefetched_tags(self, obj): + if hasattr(obj, "_prefetched_objects_cache") and "tags" in obj._prefetched_objects_cache: + return list(obj._prefetched_objects_cache["tags"]) + return None + + def _result_output_href(self, obj, result: ArchiveResult) -> str: + ignored = {"stdout.log", "stderr.log", "hook.pid", "listener.pid", "cmd.sh"} + + for rel_path in result.output_file_paths(): + raw_path = str(rel_path or "").strip().lstrip("/") + if not raw_path: + continue + basename = raw_path.rsplit("/", 1)[-1] + if basename in ignored or raw_path.endswith((".pid", ".log", ".sh")): + continue + relative_path = raw_path if raw_path.startswith(f"{result.plugin}/") else f"{result.plugin}/{raw_path}" + return f"/{obj.archive_path_from_db}/{relative_path}" + + raw_output = str(result.output_str or "").strip().lstrip("/") + if raw_output and raw_output not in {".", "./"} and "://" not in raw_output and not raw_output.startswith("/"): + relative_path = raw_output if raw_output.startswith(f"{result.plugin}/") else f"{result.plugin}/{raw_output}" + return f"/{obj.archive_path_from_db}/{relative_path}" + + return f"/{obj.archive_path_from_db}/{result.plugin}/" + + def _get_ordering_fields(self, request): + ordering = request.GET.get("o") + if not ordering: + return set() + fields = set() + for part in ordering.split("."): + if not part: + continue + try: + idx = abs(int(part)) - 1 + except ValueError: + continue + if 0 <= idx < len(self.list_display): + fields.add(self.list_display[idx]) + return fields + 
+ @admin.display( + description="Original URL", + ordering="url", + ) + def url_str(self, obj): + return format_html( + '{}', + obj.url, + obj.url[:128], + ) + + @admin.display(description="Health", ordering="health") + def health_display(self, obj): + h = obj.health + color = "green" if h >= 80 else "orange" if h >= 50 else "red" + return format_html('{}', color, h) + + def grid_view(self, request, extra_context=None): + + # cl = self.get_changelist_instance(request) + + # Save before monkey patching to restore for changelist list view + admin_cls = type(self) + saved_change_list_template = admin_cls.change_list_template + saved_list_per_page = admin_cls.list_per_page + saved_list_max_show_all = admin_cls.list_max_show_all + + # Monkey patch here plus core_tags.py + admin_cls.change_list_template = "private_index_grid.html" + admin_cls.list_per_page = SERVER_CONFIG.SNAPSHOTS_PER_PAGE + admin_cls.list_max_show_all = admin_cls.list_per_page + + # Call monkey patched view + rendered_response = self.changelist_view(request, extra_context=extra_context) + + # Restore values + admin_cls.change_list_template = saved_change_list_template + admin_cls.list_per_page = saved_list_per_page + admin_cls.list_max_show_all = saved_list_max_show_all + + return rendered_response + + # for debugging, uncomment this to print all requests: + # def changelist_view(self, request, extra_context=None): + # print('[*] Got request', request.method, request.POST) + # return super().changelist_view(request, extra_context=None) + + @admin.action( + description="🔁 Redo Failed", + ) + def update_snapshots(self, request, queryset): + queued = bg_archive_snapshots(queryset, kwargs={"overwrite": False, "out_dir": DATA_DIR}) + + messages.success( + request, + f"Queued {queued} snapshots for re-archiving. 
The background runner will process them.", + ) + + @admin.action( + description="🆕 Archive Now", + ) + def resnapshot_snapshot(self, request, queryset): + snapshots = list(queryset) + if not snapshots: + messages.info(request, "No snapshots selected.") + return + + urls = "\n".join(snapshot.url for snapshot in snapshots if snapshot.url) + if not urls: + messages.info(request, "No valid snapshot URLs were found to archive.") + return + + bg_add({"urls": urls}) + + messages.success( + request, + f"Creating 1 new crawl with {len(snapshots)} fresh snapshots. The background runner will process them.", + ) + + @admin.action( + description="🔄 Redo", + ) + def overwrite_snapshots(self, request, queryset): + queued = bg_archive_snapshots(queryset, kwargs={"overwrite": True, "out_dir": DATA_DIR}) + + messages.success( + request, + f"Queued {queued} snapshots for full re-archive (overwriting existing). The background runner will process them.", + ) + + @admin.action( + description="đŸ—‘ī¸ Delete", + ) + def delete_snapshots(self, request, queryset): + """Delete snapshots in a single transaction to avoid SQLite concurrency issues.""" + from django.db import transaction + + total = queryset.count() + + # Get list of IDs to delete first (outside transaction) + ids_to_delete = list(queryset.values_list("pk", flat=True)) + + # Delete everything in a single atomic transaction + with transaction.atomic(): + deleted_count, _ = Snapshot.objects.filter(pk__in=ids_to_delete).delete() + + messages.success( + request, + mark_safe( + f"Successfully deleted {total} Snapshots ({deleted_count} total objects including related records). 
Don't forget to scrub URLs from import logs (data/sources) and error logs (data/logs) if needed.", + ), + ) + + @admin.action( + description="+", + ) + def add_tags(self, request, queryset): + from archivebox.core.models import SnapshotTag + + # Get tags from the form - now comma-separated string + tags_str = request.POST.get("tags", "") + if not tags_str: + messages.warning(request, "No tags specified.") + return + + # Parse comma-separated tag names and get/create Tag objects + tag_names = [name.strip() for name in tags_str.split(",") if name.strip()] + tags = [] + for name in tag_names: + tag, _ = Tag.objects.get_or_create( + name__iexact=name, + defaults={"name": name}, + ) + tag = Tag.objects.filter(name__iexact=name).first() or tag + tags.append(tag) + + # Get snapshot IDs efficiently (works with select_across for all pages) + snapshot_ids = list(queryset.values_list("id", flat=True)) + num_snapshots = len(snapshot_ids) + + print("[+] Adding tags", [t.name for t in tags], "to", num_snapshots, "Snapshots") + + # Bulk create M2M relationships (1 query per tag, not per snapshot) + for tag in tags: + SnapshotTag.objects.bulk_create( + [SnapshotTag(snapshot_id=sid, tag=tag) for sid in snapshot_ids], + ignore_conflicts=True, # Skip if relationship already exists + ) + + messages.success( + request, + f"Added {len(tags)} tag(s) to {num_snapshots} Snapshot(s).", + ) + + @admin.action( + description="–", + ) + def remove_tags(self, request, queryset): + from archivebox.core.models import SnapshotTag + + # Get tags from the form - now comma-separated string + tags_str = request.POST.get("tags", "") + if not tags_str: + messages.warning(request, "No tags specified.") + return + + # Parse comma-separated tag names and find matching Tag objects (case-insensitive) + tag_names = [name.strip() for name in tags_str.split(",") if name.strip()] + tags = [] + for name in tag_names: + tag = Tag.objects.filter(name__iexact=name).first() + if tag: + tags.append(tag) + + if not 
tags: + messages.warning(request, "No matching tags found.") + return + + # Get snapshot IDs efficiently (works with select_across for all pages) + snapshot_ids = list(queryset.values_list("id", flat=True)) + num_snapshots = len(snapshot_ids) + tag_ids = [t.pk for t in tags] + + print("[-] Removing tags", [t.name for t in tags], "from", num_snapshots, "Snapshots") + + # Bulk delete M2M relationships (1 query total, not per snapshot) + deleted_count, _ = SnapshotTag.objects.filter( + snapshot_id__in=snapshot_ids, + tag_id__in=tag_ids, + ).delete() + + messages.success( + request, + f"Removed {len(tags)} tag(s) from {num_snapshots} Snapshot(s) ({deleted_count} associations deleted).", + ) diff --git a/archivebox/core/admin_tags.py b/archivebox/core/admin_tags.py new file mode 100644 index 0000000000..dfa456bba6 --- /dev/null +++ b/archivebox/core/admin_tags.py @@ -0,0 +1,235 @@ +__package__ = "archivebox.core" + +from urllib.parse import quote + +from django import forms +from django.contrib import admin, messages +from django.contrib.admin.options import IS_POPUP_VAR +from django.http import HttpRequest, HttpResponseRedirect +from django.urls import reverse +from django.utils.html import format_html +from django.utils.safestring import mark_safe + +from archivebox.base_models.admin import BaseModelAdmin +from archivebox.core.models import SnapshotTag, Tag +from archivebox.core.tag_utils import ( + TAG_HAS_SNAPSHOTS_CHOICES, + TAG_SORT_CHOICES, + build_tag_cards, + get_tag_creator_choices, + get_tag_year_choices, + normalize_created_by_filter, + normalize_created_year_filter, + normalize_has_snapshots_filter, + normalize_tag_sort, +) +from archivebox.core.host_utils import build_snapshot_url + + +class TagInline(admin.TabularInline): + model = SnapshotTag + fields = ("id", "tag") + extra = 1 + max_num = 1000 + autocomplete_fields = ("tag",) + + +class TagAdminForm(forms.ModelForm): + class Meta: + model = Tag + fields = "__all__" + widgets = { + "name": 
forms.TextInput( + attrs={ + "placeholder": "research, receipts, product-design...", + "autocomplete": "off", + "spellcheck": "false", + "data-tag-name-input": "1", + }, + ), + } + + def clean_name(self): + name = (self.cleaned_data.get("name") or "").strip() + if not name: + raise forms.ValidationError("Tag name is required.") + return name + + +class TagAdmin(BaseModelAdmin): + form = TagAdminForm + change_list_template = "admin/core/tag/change_list.html" + change_form_template = "admin/core/tag/change_form.html" + list_display = ("name", "num_snapshots", "created_at", "created_by") + list_filter = ("created_at", "created_by") + search_fields = ("id", "name", "slug") + readonly_fields = ("slug", "id", "created_at", "modified_at", "snapshots") + actions = ["delete_selected"] + ordering = ["name", "id"] + + fieldsets = ( + ( + "Tag", + { + "fields": ("name", "slug"), + "classes": ("card",), + }, + ), + ( + "Metadata", + { + "fields": ("id", "created_by", "created_at", "modified_at"), + "classes": ("card",), + }, + ), + ( + "Recent Snapshots", + { + "fields": ("snapshots",), + "classes": ("card", "wide"), + }, + ), + ) + + add_fieldsets = ( + ( + "Tag", + { + "fields": ("name",), + "classes": ("card", "wide"), + }, + ), + ( + "Metadata", + { + "fields": ("created_by",), + "classes": ("card",), + }, + ), + ) + + def get_fieldsets(self, request: HttpRequest, obj: Tag | None = None): + return self.fieldsets if obj else self.add_fieldsets + + def changelist_view(self, request: HttpRequest, extra_context=None): + query = (request.GET.get("q") or "").strip() + sort = normalize_tag_sort((request.GET.get("sort") or "created_desc").strip()) + created_by = normalize_created_by_filter((request.GET.get("created_by") or "").strip()) + year = normalize_created_year_filter((request.GET.get("year") or "").strip()) + has_snapshots = normalize_has_snapshots_filter((request.GET.get("has_snapshots") or "all").strip()) + extra_context = { + **(extra_context or {}), + "initial_query": 
query, + "initial_sort": sort, + "initial_created_by": created_by, + "initial_year": year, + "initial_has_snapshots": has_snapshots, + "tag_sort_choices": TAG_SORT_CHOICES, + "tag_has_snapshots_choices": TAG_HAS_SNAPSHOTS_CHOICES, + "tag_created_by_choices": get_tag_creator_choices(), + "tag_year_choices": get_tag_year_choices(), + "initial_tag_cards": build_tag_cards( + query=query, + request=request, + sort=sort, + created_by=created_by, + year=year, + has_snapshots=has_snapshots, + ), + "tag_search_api_url": reverse("api-1:search_tags"), + "tag_create_api_url": reverse("api-1:tags_create"), + } + return super().changelist_view(request, extra_context=extra_context) + + def render_change_form(self, request, context, add=False, change=False, form_url="", obj=None): + current_name = (request.POST.get("name") or "").strip() + if not current_name and obj: + current_name = obj.name + + similar_tag_cards = ( + build_tag_cards(query=current_name, request=request, limit=12) if current_name else build_tag_cards(request=request, limit=12) + ) + if obj: + similar_tag_cards = [card for card in similar_tag_cards if card["id"] != obj.pk] + + context.update( + { + "tag_search_api_url": reverse("api-1:search_tags"), + "tag_similar_cards": similar_tag_cards, + "tag_similar_query": current_name, + }, + ) + return super().render_change_form(request, context, add=add, change=change, form_url=form_url, obj=obj) + + def response_add(self, request: HttpRequest, obj: Tag, post_url_continue=None): + if IS_POPUP_VAR in request.POST or "_continue" in request.POST or "_addanother" in request.POST: + return super().response_add(request, obj, post_url_continue=post_url_continue) + + self.message_user(request, f'Tag "{obj.name}" saved.', level=messages.SUCCESS) + return self._redirect_to_changelist(obj.name) + + def response_change(self, request: HttpRequest, obj: Tag): + if IS_POPUP_VAR in request.POST or "_continue" in request.POST or "_addanother" in request.POST or "_saveasnew" in 
request.POST: + return super().response_change(request, obj) + + self.message_user(request, f'Tag "{obj.name}" updated.', level=messages.SUCCESS) + return self._redirect_to_changelist(obj.name) + + def _redirect_to_changelist(self, query: str = "") -> HttpResponseRedirect: + changelist_url = reverse("admin:core_tag_changelist") + if query: + changelist_url = f"{changelist_url}?q={quote(query)}" + return HttpResponseRedirect(changelist_url) + + @admin.display(description="Snapshots") + def snapshots(self, tag: Tag): + snapshots = tag.snapshot_set.select_related("crawl__created_by").order_by("-downloaded_at", "-created_at", "-pk")[:10] + total_count = tag.snapshot_set.count() + if not snapshots: + return mark_safe( + f'

No snapshots use this tag yet. ' + f'Open filtered snapshot list.

', + ) + + cards = [] + for snapshot in snapshots: + title = (snapshot.title or "").strip() or snapshot.url + cards.append( + format_html( + """ + + + + {} + {} + + + """, + reverse("admin:core_snapshot_change", args=[snapshot.pk]), + build_snapshot_url(str(snapshot.pk), "favicon.ico"), + title[:120], + snapshot.url[:120], + ), + ) + + cards.append( + format_html( + 'View all {} tagged snapshots', + tag.id, + total_count, + ), + ) + return mark_safe('
' + "".join(cards) + "
") + + @admin.display(description="Snapshots", ordering="num_snapshots") + def num_snapshots(self, tag: Tag): + count = getattr(tag, "num_snapshots", tag.snapshot_set.count()) + return format_html( + '{} total', + tag.id, + count, + ) + + +def register_admin(admin_site): + admin_site.register(Tag, TagAdmin) diff --git a/archivebox/core/admin_users.py b/archivebox/core/admin_users.py new file mode 100644 index 0000000000..7a38271b74 --- /dev/null +++ b/archivebox/core/admin_users.py @@ -0,0 +1,108 @@ +__package__ = "archivebox.core" + +from django.contrib import admin +from django.contrib.auth.admin import UserAdmin +from django.contrib.auth import get_user_model +from django.utils.html import format_html +from django.utils.safestring import mark_safe + + +class CustomUserAdmin(UserAdmin): + sort_fields = ["id", "email", "username", "is_superuser", "last_login", "date_joined"] + list_display = ["username", "id", "email", "is_superuser", "last_login", "date_joined"] + readonly_fields = ("snapshot_set", "archiveresult_set", "tag_set", "apitoken_set", "outboundwebhook_set") + + # Preserve Django's default user creation form and fieldsets + # This ensures passwords are properly hashed and permissions are set correctly + add_fieldsets = UserAdmin.add_fieldsets + + # Extend fieldsets for change form only (not user creation) + fieldsets = [*(UserAdmin.fieldsets or ()), ("Data", {"fields": readonly_fields})] + + @admin.display(description="Snapshots") + def snapshot_set(self, obj): + total_count = obj.snapshot_set.count() + return mark_safe( + "
".join( + format_html( + '[{}] 📅 {} {}', + snap.pk, + str(snap.id)[:8], + snap.downloaded_at.strftime("%Y-%m-%d %H:%M") if snap.downloaded_at else "pending...", + snap.url[:64], + ) + for snap in obj.snapshot_set.order_by("-modified_at")[:10] + ) + + f'
{total_count} total records...', + ) + + @admin.display(description="Archive Result Logs") + def archiveresult_set(self, obj): + total_count = obj.archiveresult_set.count() + return mark_safe( + "
".join( + format_html( + '
[{}] 📅 {} 📄 {} {}', + result.pk, + str(result.id)[:8], + result.snapshot.downloaded_at.strftime("%Y-%m-%d %H:%M") if result.snapshot.downloaded_at else "pending...", + result.extractor, + result.snapshot.url[:64], + ) + for result in obj.archiveresult_set.order_by("-modified_at")[:10] + ) + + f'
{total_count} total records...', + ) + + @admin.display(description="Tags") + def tag_set(self, obj): + total_count = obj.tag_set.count() + return mark_safe( + ", ".join( + format_html( + '{}', + tag.pk, + tag.name, + ) + for tag in obj.tag_set.order_by("-modified_at")[:10] + ) + + f'
{total_count} total records...', + ) + + @admin.display(description="API Tokens") + def apitoken_set(self, obj): + total_count = obj.apitoken_set.count() + return mark_safe( + "
".join( + format_html( + '
[{}] {} (expires {})', + apitoken.pk, + str(apitoken.id)[:8], + apitoken.token_redacted[:64], + apitoken.expires, + ) + for apitoken in obj.apitoken_set.order_by("-modified_at")[:10] + ) + + f'
{total_count} total records...', + ) + + @admin.display(description="API Outbound Webhooks") + def outboundwebhook_set(self, obj): + total_count = obj.outboundwebhook_set.count() + return mark_safe( + "
".join( + format_html( + '
[{}] {} -> {}', + outboundwebhook.pk, + str(outboundwebhook.id)[:8], + outboundwebhook.referenced_model, + outboundwebhook.endpoint, + ) + for outboundwebhook in obj.outboundwebhook_set.order_by("-modified_at")[:10] + ) + + f'
{total_count} total records...', + ) + + +def register_admin(admin_site): + admin_site.register(get_user_model(), CustomUserAdmin) diff --git a/archivebox/core/apps.py b/archivebox/core/apps.py index 5182da0506..b173ae90d4 100644 --- a/archivebox/core/apps.py +++ b/archivebox/core/apps.py @@ -1,6 +1,49 @@ +__package__ = "archivebox.core" + from django.apps import AppConfig +import os class CoreConfig(AppConfig): - name = 'core' - default_auto_field = 'django.db.models.UUIDField' + name = "archivebox.core" + label = "core" + + def ready(self): + """Register the archivebox.core.admin_site as the main django admin site""" + import sys + from django.utils.autoreload import DJANGO_AUTORELOAD_ENV + + from archivebox.core.admin_site import register_admin_site + + register_admin_site() + + # Import models to register state machines with the registry + # Skip during makemigrations to avoid premature state machine access + if "makemigrations" not in sys.argv: + from archivebox.core import models # noqa: F401 + + pidfile = os.environ.get("ARCHIVEBOX_RUNSERVER_PIDFILE") + if pidfile: + should_write_pid = True + if os.environ.get("ARCHIVEBOX_AUTORELOAD") == "1": + should_write_pid = os.environ.get(DJANGO_AUTORELOAD_ENV) == "true" + if should_write_pid: + try: + with open(pidfile, "w") as handle: + handle.write(str(os.getpid())) + except Exception: + pass + + def _should_prepare_runtime() -> bool: + if os.environ.get("ARCHIVEBOX_RUNSERVER") == "1": + if os.environ.get("ARCHIVEBOX_AUTORELOAD") == "1": + return os.environ.get(DJANGO_AUTORELOAD_ENV) == "true" + return True + return False + + if _should_prepare_runtime(): + from archivebox.machine.models import Process, Machine + + Process.cleanup_stale_running() + Process.cleanup_orphaned_workers() + Machine.current() diff --git a/archivebox/core/asgi.py b/archivebox/core/asgi.py new file mode 100644 index 0000000000..1253fbb049 --- /dev/null +++ b/archivebox/core/asgi.py @@ -0,0 +1,29 @@ +""" +ASGI config for archivebox project. 
+ +It exposes the ASGI callable as a module-level variable named ``application``. + +For more information on this file, see +https://docs.djangoproject.com/en/stable/howto/deployment/asgi/ +""" + +from archivebox.config.django import setup_django +from django.core.asgi import get_asgi_application + +setup_django(in_memory_db=False, check_db=True) + +# Standard Django ASGI application (no websockets/channels needed) +application = get_asgi_application() + +# If websocket support is needed later, install channels and use: +# from channels.routing import ProtocolTypeRouter, URLRouter +# from channels.auth import AuthMiddlewareStack +# from channels.security.websocket import AllowedHostsOriginValidator +# from archivebox.core.routing import websocket_urlpatterns +# +# application = ProtocolTypeRouter({ +# "http": get_asgi_application(), +# "websocket": AllowedHostsOriginValidator( +# AuthMiddlewareStack(URLRouter(websocket_urlpatterns)) +# ), +# }) diff --git a/archivebox/core/forms.py b/archivebox/core/forms.py index 99f4d02eba..edc0403d06 100644 --- a/archivebox/core/forms.py +++ b/archivebox/core/forms.py @@ -1,68 +1,343 @@ -__package__ = 'archivebox.core' +__package__ = "archivebox.core" from django import forms +from django.utils.html import format_html -from ..util import URL_REGEX -from ..parsers import PARSERS -from ..vendor.taggit_utils import edit_string_for_tags, parse_tags +from archivebox.misc.util import URL_REGEX, find_all_urls, parse_filesize_to_bytes +from taggit.utils import edit_string_for_tags, parse_tags +from archivebox.base_models.admin import KeyValueWidget +from archivebox.crawls.schedule_utils import validate_schedule +from archivebox.config.common import SEARCH_BACKEND_CONFIG +from archivebox.core.widgets import TagEditorWidget, URLFiltersWidget +from archivebox.hooks import get_plugins, discover_plugin_configs, get_plugin_icon +from archivebox.personas.models import Persona -PARSER_CHOICES = [ - (parser_key, parser[0]) - for parser_key, 
parser in PARSERS.items() -] DEPTH_CHOICES = ( - ('0', 'depth = 0 (archive just these URLs)'), - ('1', 'depth = 1 (archive these URLs and all URLs one hop away)'), + ("0", "depth = 0 (archive just these URLs)"), + ("1", "depth = 1 (+ URLs one hop away)"), + ("2", "depth = 2 (+ URLs two hops away)"), + ("3", "depth = 3 (+ URLs three hops away)"), + ("4", "depth = 4 (+ URLs four hops away)"), ) -from ..extractors import get_default_archive_methods -ARCHIVE_METHODS = [ - (name, name) - for name, _, _ in get_default_archive_methods() -] +def get_plugin_choices(): + """Get available extractor plugins from discovered hooks.""" + return [(name, name) for name in get_plugins()] + + +def get_plugin_choice_label(plugin_name: str, plugin_configs: dict[str, dict]) -> str: + schema = plugin_configs.get(plugin_name, {}) + description = str(schema.get("description") or "").strip() + if not description: + return plugin_name + icon_html = get_plugin_icon(plugin_name) + + return format_html( + '{}{}{}', + icon_html, + plugin_name, + plugin_name, + description, + ) + + +def get_choice_field(form: forms.Form, name: str) -> forms.ChoiceField: + field = form.fields[name] + if not isinstance(field, forms.ChoiceField): + raise TypeError(f"{name} must be a ChoiceField") + return field class AddLinkForm(forms.Form): - url = forms.RegexField(label="URLs (one per line)", regex=URL_REGEX, min_length='6', strip=True, widget=forms.Textarea, required=True) - parser = forms.ChoiceField(label="URLs format", choices=[('auto', 'Auto-detect parser'), *PARSER_CHOICES], initial='auto') - tag = forms.CharField(label="Tags (comma separated tag1,tag2,tag3)", strip=True, required=False) - depth = forms.ChoiceField(label="Archive depth", choices=DEPTH_CHOICES, initial='0', widget=forms.RadioSelect(attrs={"class": "depth-selection"})) - archive_methods = forms.MultipleChoiceField( - label="Archive methods (select at least 1, otherwise all will be used by default)", + # Basic fields + url = forms.CharField( + 
label="URLs", + strip=True, + widget=forms.Textarea( + attrs={ + "data-url-regex": URL_REGEX.pattern, + }, + ), + required=True, + ) + tag = forms.CharField( + label="Tags", + strip=True, + required=False, + widget=TagEditorWidget(), + ) + depth = forms.ChoiceField( + label="Archive depth", + choices=DEPTH_CHOICES, + initial="0", + widget=forms.RadioSelect(attrs={"class": "depth-selection"}), + ) + max_urls = forms.IntegerField( + label="Max URLs", + required=False, + min_value=0, + initial=0, + widget=forms.NumberInput( + attrs={ + "min": 0, + "step": 1, + "placeholder": "0 = unlimited", + }, + ), + ) + max_size = forms.CharField( + label="Max size", + required=False, + initial="0", + widget=forms.TextInput( + attrs={ + "placeholder": "0 = unlimited, or e.g. 45mb / 1gb", + }, + ), + ) + notes = forms.CharField( + label="Notes", + strip=True, + required=False, + widget=forms.TextInput( + attrs={ + "placeholder": "Optional notes about this crawl", + }, + ), + ) + url_filters = forms.Field( + label="URL allowlist / denylist", required=False, - widget=forms.SelectMultiple, - choices=ARCHIVE_METHODS, - ) - # TODO: hook these up to the view and put them - # in a collapsible UI section labeled "Advanced" - # - # exclude_patterns = forms.CharField( - # label="Exclude patterns", - # min_length='1', - # required=False, - # initial=URL_BLACKLIST, - # ) - # timeout = forms.IntegerField( - # initial=TIMEOUT, - # ) - # overwrite = forms.BooleanField( - # label="Overwrite any existing Snapshots", - # initial=False, - # ) - # index_only = forms.BooleanField( - # label="Add URLs to index without Snapshotting", - # initial=False, - # ) - -class TagWidgetMixin: + widget=URLFiltersWidget(source_selector='textarea[name="url"]'), + ) + + # Plugin groups + chrome_plugins = forms.MultipleChoiceField( + label="Chrome-dependent plugins", + required=False, + widget=forms.CheckboxSelectMultiple, + choices=[], # populated in __init__ + ) + archiving_plugins = forms.MultipleChoiceField( + 
label="Archiving", + required=False, + widget=forms.CheckboxSelectMultiple, + choices=[], + ) + parsing_plugins = forms.MultipleChoiceField( + label="Parsing", + required=False, + widget=forms.CheckboxSelectMultiple, + choices=[], + ) + search_plugins = forms.MultipleChoiceField( + label="Search", + required=False, + widget=forms.CheckboxSelectMultiple, + choices=[], + ) + binary_plugins = forms.MultipleChoiceField( + label="Binary providers", + required=False, + widget=forms.CheckboxSelectMultiple, + choices=[], + ) + extension_plugins = forms.MultipleChoiceField( + label="Browser extensions", + required=False, + widget=forms.CheckboxSelectMultiple, + choices=[], + ) + + # Advanced options + schedule = forms.CharField( + label="Repeat schedule", + max_length=64, + required=False, + widget=forms.TextInput( + attrs={ + "placeholder": "e.g., daily, weekly, 0 */6 * * * (every 6 hours)", + }, + ), + ) + persona = forms.ModelChoiceField( + label="Persona (authentication profile)", + required=False, + queryset=Persona.objects.none(), + empty_label=None, + to_field_name="name", + ) + index_only = forms.BooleanField( + label="Index only dry run (add crawl but don't archive yet)", + initial=False, + required=False, + ) + config = forms.JSONField( + label="Custom config overrides", + widget=KeyValueWidget(), + initial=dict, + required=False, + ) + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + default_persona = Persona.get_or_create_default() + self.fields["persona"].queryset = Persona.objects.order_by("name") + self.fields["persona"].initial = default_persona.name + + # Get all plugins + all_plugins = get_plugins() + plugin_configs = discover_plugin_configs() + + # Define plugin groups + chrome_dependent = { + "accessibility", + "chrome", + "consolelog", + "dom", + "headers", + "parse_dom_outlinks", + "pdf", + "redirects", + "responses", + "screenshot", + "seo", + "singlefile", + "ssl", + "staticfile", + "title", + } + archiving = { + 
"archivedotorg", + "defuddle", + "favicon", + "forumdl", + "gallerydl", + "git", + "htmltotext", + "mercury", + "papersdl", + "readability", + "trafilatura", + "wget", + "ytdlp", + } + parsing = { + "parse_html_urls", + "parse_jsonl_urls", + "parse_netscape_urls", + "parse_rss_urls", + "parse_txt_urls", + } + search = { + "search_backend_ripgrep", + "search_backend_sonic", + "search_backend_sqlite", + } + binary = {"apt", "brew", "custom", "env", "npm", "pip"} + extensions = {"twocaptcha", "istilldontcareaboutcookies", "ublock"} + + # Populate plugin field choices + get_choice_field(self, "chrome_plugins").choices = [ + (p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in chrome_dependent + ] + get_choice_field(self, "archiving_plugins").choices = [ + (p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in archiving + ] + get_choice_field(self, "parsing_plugins").choices = [ + (p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in parsing + ] + get_choice_field(self, "search_plugins").choices = [ + (p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in search + ] + get_choice_field(self, "binary_plugins").choices = [ + (p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in binary + ] + get_choice_field(self, "extension_plugins").choices = [ + (p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in extensions + ] + + required_search_plugin = f"search_backend_{SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}".strip() + search_choices = [choice[0] for choice in get_choice_field(self, "search_plugins").choices] + if required_search_plugin in search_choices: + get_choice_field(self, "search_plugins").initial = [required_search_plugin] + + def clean(self): + cleaned_data = super().clean() or {} + + # Combine all plugin groups into single list + all_selected_plugins = [] + for field in [ + 
"chrome_plugins", + "archiving_plugins", + "parsing_plugins", + "search_plugins", + "binary_plugins", + "extension_plugins", + ]: + selected = cleaned_data.get(field) + if isinstance(selected, list): + all_selected_plugins.extend(selected) + + # Store combined list for easy access + cleaned_data["plugins"] = all_selected_plugins + + return cleaned_data + + def clean_url(self): + value = self.cleaned_data.get("url") or "" + urls = "\n".join(find_all_urls(value)) + if not urls: + raise forms.ValidationError("Enter at least one valid URL.") + return urls + + def clean_url_filters(self): + from archivebox.crawls.models import Crawl + + value = self.cleaned_data.get("url_filters") or {} + return { + "allowlist": "\n".join(Crawl.split_filter_patterns(value.get("allowlist", ""))), + "denylist": "\n".join(Crawl.split_filter_patterns(value.get("denylist", ""))), + "same_domain_only": bool(value.get("same_domain_only")), + } + + def clean_max_urls(self): + value = self.cleaned_data.get("max_urls") + return int(value or 0) + + def clean_max_size(self): + raw_value = str(self.cleaned_data.get("max_size") or "").strip() + if not raw_value: + return 0 + try: + value = parse_filesize_to_bytes(raw_value) + except ValueError as err: + raise forms.ValidationError(str(err)) + if value < 0: + raise forms.ValidationError("Max size must be 0 or a positive number of bytes.") + return value + + def clean_schedule(self): + schedule = (self.cleaned_data.get("schedule") or "").strip() + if not schedule: + return "" + + try: + validate_schedule(schedule) + except ValueError as err: + raise forms.ValidationError(str(err)) + + return schedule + + +class TagWidget(forms.TextInput): def format_value(self, value): if value is not None and not isinstance(value, str): value = edit_string_for_tags(value) return super().format_value(value) -class TagWidget(TagWidgetMixin, forms.TextInput): - pass class TagField(forms.CharField): widget = TagWidget @@ -73,24 +348,24 @@ def clean(self, value): return 
parse_tags(value) except ValueError: raise forms.ValidationError( - "Please provide a comma-separated list of tags." + "Please provide a comma-separated list of tags.", ) - def has_changed(self, initial_value, data_value): + def has_changed(self, initial, data): # Always return False if the field is disabled since self.bound_data # always uses the initial value in this case. if self.disabled: return False try: - data_value = self.clean(data_value) + cleaned_data = self.clean(data) except forms.ValidationError: - pass + cleaned_data = data - if initial_value is None: - initial_value = [] + initial_value = [] if initial is None else initial - initial_value = [tag.name for tag in initial_value] - initial_value.sort() + if not isinstance(initial_value, list): + initial_value = list(initial_value) - return initial_value != data_value + normalized_initial = sorted(tag.name for tag in initial_value) + return normalized_initial != cleaned_data diff --git a/archivebox/core/host_utils.py b/archivebox/core/host_utils.py new file mode 100644 index 0000000000..214797886e --- /dev/null +++ b/archivebox/core/host_utils.py @@ -0,0 +1,235 @@ +from __future__ import annotations + +import re +from urllib.parse import urlparse + +from archivebox.config.common import SERVER_CONFIG + + +_SNAPSHOT_ID_RE = re.compile(r"^[0-9a-fA-F-]{8,36}$") +_SNAPSHOT_SUBDOMAIN_RE = re.compile(r"^snap-(?P[0-9a-fA-F]{12})$") + + +def split_host_port(host: str) -> tuple[str, str | None]: + parsed = urlparse(f"//{host}") + hostname = (parsed.hostname or host or "").lower() + port = str(parsed.port) if parsed.port else None + return hostname, port + + +def _normalize_base_url(value: str | None) -> str: + if not value: + return "" + base = value.strip() + if not base: + return "" + if "://" not in base: + base = f"http://{base}" + parsed = urlparse(base) + if not parsed.netloc: + return "" + return f"{parsed.scheme}://{parsed.netloc}" + + +def normalize_base_url(value: str | None) -> str: + return 
_normalize_base_url(value) + + +def get_listen_host() -> str: + return (SERVER_CONFIG.LISTEN_HOST or "").strip() + + +def get_listen_parts() -> tuple[str, str | None]: + return split_host_port(get_listen_host()) + + +def _build_listen_host(subdomain: str | None) -> str: + host, port = get_listen_parts() + if not host: + return "" + full_host = f"{subdomain}.{host}" if subdomain else host + if port: + return f"{full_host}:{port}" + return full_host + + +def get_admin_host() -> str: + if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING: + return get_listen_host().lower() + override = _normalize_base_url(SERVER_CONFIG.ADMIN_BASE_URL) + if override: + return urlparse(override).netloc.lower() + return _build_listen_host("admin") + + +def get_web_host() -> str: + if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING: + return get_listen_host().lower() + override = _normalize_base_url(SERVER_CONFIG.ARCHIVE_BASE_URL) + if override: + return urlparse(override).netloc.lower() + return _build_listen_host("web") + + +def get_api_host() -> str: + if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING: + return get_listen_host().lower() + return _build_listen_host("api") + + +def get_public_host() -> str: + if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING: + return get_listen_host().lower() + return _build_listen_host("public") + + +def get_snapshot_subdomain(snapshot_id: str) -> str: + normalized = re.sub(r"[^0-9a-fA-F]", "", snapshot_id or "") + suffix = (normalized[-12:] if len(normalized) >= 12 else normalized).lower() + return f"snap-{suffix}" + + +def get_snapshot_host(snapshot_id: str) -> str: + if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING: + return get_listen_host().lower() + return _build_listen_host(get_snapshot_subdomain(snapshot_id)) + + +def get_original_host(domain: str) -> str: + if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING: + return get_listen_host().lower() + return _build_listen_host(domain) + + +def is_snapshot_subdomain(subdomain: str) -> bool: + value = (subdomain or "").strip() + return 
bool(_SNAPSHOT_SUBDOMAIN_RE.match(value) or _SNAPSHOT_ID_RE.match(value)) + + +def get_snapshot_lookup_key(snapshot_ref: str) -> str: + value = (snapshot_ref or "").strip().lower() + match = _SNAPSHOT_SUBDOMAIN_RE.match(value) + if match: + return match.group("suffix") + return value + + +def get_listen_subdomain(request_host: str) -> str: + if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING: + return "" + req_host, req_port = split_host_port(request_host) + listen_host, listen_port = get_listen_parts() + if not listen_host: + return "" + if listen_port and req_port and listen_port != req_port: + return "" + if req_host == listen_host: + return "" + suffix = f".{listen_host}" + if req_host.endswith(suffix): + return req_host[: -len(suffix)] + return "" + + +def host_matches(request_host: str, target_host: str) -> bool: + if not request_host or not target_host: + return False + req_host, req_port = split_host_port(request_host) + target_host_only, target_port = split_host_port(target_host) + if req_host != target_host_only: + return False + if target_port and req_port and target_port != req_port: + return False + return True + + +def _scheme_from_request(request=None) -> str: + if request: + return request.scheme + return "http" + + +def _build_base_url_for_host(host: str, request=None) -> str: + if not host: + return "" + scheme = _scheme_from_request(request) + return f"{scheme}://{host}" + + +def get_admin_base_url(request=None) -> str: + override = _normalize_base_url(SERVER_CONFIG.ADMIN_BASE_URL) + if override: + return override + if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING: + return _build_base_url_for_host(get_listen_host(), request=request) + return _build_base_url_for_host(get_admin_host(), request=request) + + +def get_web_base_url(request=None) -> str: + override = _normalize_base_url(SERVER_CONFIG.ARCHIVE_BASE_URL) + if override: + return override + if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING: + return _build_base_url_for_host(get_listen_host(), 
request=request) + return _build_base_url_for_host(get_web_host(), request=request) + + +def get_api_base_url(request=None) -> str: + if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING: + return _build_base_url_for_host(get_listen_host(), request=request) + return _build_base_url_for_host(get_api_host(), request=request) + + +def get_public_base_url(request=None) -> str: + return _build_base_url_for_host(get_public_host(), request=request) + + +# Backwards-compat aliases (archive == web) +def get_archive_base_url(request=None) -> str: + return get_web_base_url(request=request) + + +def get_snapshot_base_url(snapshot_id: str, request=None) -> str: + if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING: + return _build_url(get_web_base_url(request=request), f"/snapshot/{snapshot_id}") + return _build_base_url_for_host(get_snapshot_host(snapshot_id), request=request) + + +def get_original_base_url(domain: str, request=None) -> str: + if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING: + return _build_url(get_web_base_url(request=request), f"/original/{domain}") + return _build_base_url_for_host(get_original_host(domain), request=request) + + +def build_admin_url(path: str = "", request=None) -> str: + return _build_url(get_admin_base_url(request), path) + + +def build_web_url(path: str = "", request=None) -> str: + return _build_url(get_web_base_url(request), path) + + +def build_api_url(path: str = "", request=None) -> str: + return _build_url(get_api_base_url(request), path) + + +def build_archive_url(path: str = "", request=None) -> str: + return _build_url(get_archive_base_url(request), path) + + +def build_snapshot_url(snapshot_id: str, path: str = "", request=None) -> str: + return _build_url(get_snapshot_base_url(snapshot_id, request=request), path) + + +def build_original_url(domain: str, path: str = "", request=None) -> str: + return _build_url(get_original_base_url(domain, request=request), path) + + +def _build_url(base_url: str, path: str) -> str: + if not base_url: + if not 
path: + return "" + return path if path.startswith("/") else f"/{path}" + if not path: + return base_url + return f"{base_url}{path if path.startswith('/') else f'/{path}'}" diff --git a/archivebox/core/management/commands/archivebox.py b/archivebox/core/management/commands/archivebox.py index a68b5d94a5..4e663fe862 100644 --- a/archivebox/core/management/commands/archivebox.py +++ b/archivebox/core/management/commands/archivebox.py @@ -1,18 +1,17 @@ -__package__ = 'archivebox' +__package__ = "archivebox" from django.core.management.base import BaseCommand - -from .cli import run_subcommand +from archivebox.cli import main as run_cli class Command(BaseCommand): - help = 'Run an ArchiveBox CLI subcommand (e.g. add, remove, list, etc)' + help = "Run an ArchiveBox CLI subcommand (e.g. add, remove, list, etc)" def add_arguments(self, parser): - parser.add_argument('subcommand', type=str, help='The subcommand you want to run') - parser.add_argument('command_args', nargs='*', help='Arguments to pass to the subcommand') - + parser.add_argument("subcommand", type=str, help="The subcommand you want to run") + parser.add_argument("command_args", nargs="*", help="Arguments to pass to the subcommand") def handle(self, *args, **kwargs): - run_subcommand(kwargs['subcommand'], args=kwargs['command_args']) + command_args = [kwargs["subcommand"], *kwargs["command_args"]] + run_cli(args=command_args) diff --git a/archivebox/core/middleware.py b/archivebox/core/middleware.py index 3b5787c400..3dddcbc6ef 100644 --- a/archivebox/core/middleware.py +++ b/archivebox/core/middleware.py @@ -1,14 +1,41 @@ -__package__ = 'archivebox.core' +__package__ = "archivebox.core" +import ipaddress +import re +from pathlib import Path from django.utils import timezone +from django.contrib.auth.middleware import RemoteUserMiddleware +from django.contrib.auth.models import AnonymousUser +from django.core.exceptions import ImproperlyConfigured +from django.shortcuts import redirect +from 
django.contrib.staticfiles import finders +from django.utils.http import http_date +from django.http import HttpResponseForbidden, HttpResponseNotModified -from ..config import PUBLIC_SNAPSHOTS +from archivebox.config.common import SERVER_CONFIG +from archivebox.config import VERSION +from archivebox.config.version import get_COMMIT_HASH +from archivebox.core.host_utils import ( + build_snapshot_url, + build_admin_url, + build_web_url, + get_api_host, + get_admin_host, + get_listen_host, + get_listen_subdomain, + get_public_host, + get_web_host, + host_matches, + is_snapshot_subdomain, + split_host_port, +) +from archivebox.core.views import SnapshotHostView, OriginalDomainHostView -def detect_timezone(request, activate: bool=True): - gmt_offset = (request.COOKIES.get('GMT_OFFSET') or '').strip() +def detect_timezone(request, activate: bool = True): + gmt_offset = (request.COOKIES.get("GMT_OFFSET") or "").strip() tz = None - if gmt_offset.replace('-', '').isdigit(): + if gmt_offset.replace("-", "").isdigit(): tz = timezone.get_fixed_timezone(int(gmt_offset)) if activate: timezone.activate(tz) @@ -25,13 +52,196 @@ def middleware(request): def CacheControlMiddleware(get_response): + snapshot_path_re = re.compile(r"^/[^/]+/\\d{8}/[^/]+/[0-9a-fA-F-]{8,36}/") + static_cache_key = (get_COMMIT_HASH() or VERSION or "dev").strip() + def middleware(request): response = get_response(request) - if '/archive/' in request.path or '/static/' in request.path: - policy = 'public' if PUBLIC_SNAPSHOTS else 'private' - response['Cache-Control'] = f'{policy}, max-age=60, stale-while-revalidate=300' - # print('Set Cache-Control header to', response['Cache-Control']) + if request.path.startswith("/static/"): + rel_path = request.path[len("/static/") :] + static_path = finders.find(rel_path) + if static_path: + try: + mtime = Path(static_path).stat().st_mtime + except OSError: + mtime = None + etag = f'"{static_cache_key}:{int(mtime) if mtime else 0}"' + inm = 
request.META.get("HTTP_IF_NONE_MATCH") + if inm: + inm_list = [item.strip() for item in inm.split(",")] + if etag in inm_list or etag.strip('"') in [i.strip('"') for i in inm_list]: + not_modified = HttpResponseNotModified() + not_modified.headers["ETag"] = etag + not_modified.headers["Cache-Control"] = "public, max-age=31536000, immutable" + if mtime: + not_modified.headers["Last-Modified"] = http_date(mtime) + return not_modified + response.headers["ETag"] = etag + response.headers["Cache-Control"] = "public, max-age=31536000, immutable" + if mtime and not response.headers.get("Last-Modified"): + response.headers["Last-Modified"] = http_date(mtime) + return response + + if "/archive/" in request.path or "/static/" in request.path or snapshot_path_re.match(request.path): + if not response.get("Cache-Control"): + policy = "public" if SERVER_CONFIG.PUBLIC_SNAPSHOTS else "private" + response["Cache-Control"] = f"{policy}, max-age=60, stale-while-revalidate=300" + # print('Set Cache-Control header to', response['Cache-Control']) return response return middleware + + +def ServerSecurityModeMiddleware(get_response): + blocked_prefixes = ("/admin", "/accounts", "/api", "/add", "/web") + allowed_methods = {"GET", "HEAD", "OPTIONS"} + + def middleware(request): + if SERVER_CONFIG.CONTROL_PLANE_ENABLED: + return get_response(request) + + request.user = AnonymousUser() + request._cached_user = request.user + + if request.method.upper() not in allowed_methods: + return HttpResponseForbidden("ArchiveBox is running with the control plane disabled in this security mode.") + + for prefix in blocked_prefixes: + if request.path == prefix or request.path.startswith(f"{prefix}/"): + return HttpResponseForbidden("ArchiveBox is running with the control plane disabled in this security mode.") + + return get_response(request) + + return middleware + + +def HostRoutingMiddleware(get_response): + snapshot_path_re = re.compile( + 
r"^/(?P[^/]+)/(?P\d{4}(?:\d{2})?(?:\d{2})?)/(?P[^/]+)/(?P[0-9a-fA-F-]{8,36})(?:/(?P.*))?$", + ) + + def middleware(request): + request_host = (request.get_host() or "").lower() + admin_host = get_admin_host() + web_host = get_web_host() + api_host = get_api_host() + public_host = get_public_host() + listen_host = get_listen_host() + subdomain = get_listen_subdomain(request_host) + + # Framework-owned assets must bypass snapshot/original-domain replay routing. + # Otherwise pages on snapshot subdomains can receive HTML for JS/CSS requests. + if request.path.startswith("/static/") or request.path in {"/favicon.ico", "/robots.txt"}: + return get_response(request) + + if SERVER_CONFIG.USES_SUBDOMAIN_ROUTING and not host_matches(request_host, admin_host): + if ( + request.path == "/admin" + or request.path.startswith("/admin/") + or request.path == "/accounts" + or request.path.startswith("/accounts/") + ): + target = build_admin_url(request.path, request=request) + if request.META.get("QUERY_STRING"): + target = f"{target}?{request.META['QUERY_STRING']}" + return redirect(target) + + if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING: + if host_matches(request_host, listen_host): + return get_response(request) + + req_host, req_port = split_host_port(request_host) + listen_host_only, listen_port = split_host_port(listen_host) + if req_host.endswith(f".{listen_host_only}"): + if not listen_port or not req_port or listen_port == req_port: + target = build_web_url(request.path, request=request) + if request.META.get("QUERY_STRING"): + target = f"{target}?{request.META['QUERY_STRING']}" + return redirect(target) + + return get_response(request) + + if host_matches(request_host, admin_host): + snapshot_match = snapshot_path_re.match(request.path) + if SERVER_CONFIG.USES_SUBDOMAIN_ROUTING and snapshot_match: + snapshot_id = snapshot_match.group("snapshot_id") + replay_path = (snapshot_match.group("path") or "").strip("/") + if replay_path == "index.html": + replay_path = "" + 
target = build_snapshot_url(snapshot_id, replay_path, request=request) + if request.META.get("QUERY_STRING"): + target = f"{target}?{request.META['QUERY_STRING']}" + return redirect(target) + return get_response(request) + + if host_matches(request_host, api_host): + request.user = AnonymousUser() + request._cached_user = request.user + if request.path.startswith("/admin"): + target = build_admin_url(request.path, request=request) + if request.META.get("QUERY_STRING"): + target = f"{target}?{request.META['QUERY_STRING']}" + return redirect(target) + if not request.path.startswith("/api/"): + target_path = f"/api{request.path if request.path.startswith('/') else f'/{request.path}'}" + if request.META.get("QUERY_STRING"): + target_path = f"{target_path}?{request.META['QUERY_STRING']}" + return redirect(target_path) + return get_response(request) + + if host_matches(request_host, web_host): + request.user = AnonymousUser() + request._cached_user = request.user + return get_response(request) + + if host_matches(request_host, public_host): + return get_response(request) + + if subdomain: + if is_snapshot_subdomain(subdomain): + view = SnapshotHostView.as_view() + return view(request, snapshot_id=subdomain, path=request.path.lstrip("/")) + view = OriginalDomainHostView.as_view() + return view(request, domain=subdomain, path=request.path.lstrip("/")) + + if host_matches(request_host, listen_host): + target = build_web_url(request.path, request=request) + if request.META.get("QUERY_STRING"): + target = f"{target}?{request.META['QUERY_STRING']}" + return redirect(target) + + if admin_host or web_host: + target = build_web_url(request.path, request=request) + if target: + if request.META.get("QUERY_STRING"): + target = f"{target}?{request.META['QUERY_STRING']}" + return redirect(target) + + return get_response(request) + + return middleware + + +class ReverseProxyAuthMiddleware(RemoteUserMiddleware): + header = 
"HTTP_{normalized}".format(normalized=SERVER_CONFIG.REVERSE_PROXY_USER_HEADER.replace("-", "_").upper()) + + def process_request(self, request): + if SERVER_CONFIG.REVERSE_PROXY_WHITELIST == "": + return + + ip = request.META.get("REMOTE_ADDR") + if not isinstance(ip, str): + return + + for cidr in SERVER_CONFIG.REVERSE_PROXY_WHITELIST.split(","): + try: + network = ipaddress.ip_network(cidr) + except ValueError: + raise ImproperlyConfigured( + "The REVERSE_PROXY_WHITELIST config parameter is in invalid format, or " + "contains invalid CIDR. Correct format is a coma-separated list of IPv4/IPv6 CIDRs.", + ) + + if ipaddress.ip_address(ip) in network: + return super().process_request(request) diff --git a/archivebox/core/migrations/0001_initial.py b/archivebox/core/migrations/0001_initial.py index 73ac78e7f1..f64cdccab1 100644 --- a/archivebox/core/migrations/0001_initial.py +++ b/archivebox/core/migrations/0001_initial.py @@ -5,23 +5,21 @@ class Migration(migrations.Migration): - initial = True - dependencies = [ - ] + dependencies = [] operations = [ migrations.CreateModel( - name='Snapshot', + name="Snapshot", fields=[ - ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)), - ('url', models.URLField(unique=True)), - ('timestamp', models.CharField(default=None, max_length=32, null=True, unique=True)), - ('title', models.CharField(default=None, max_length=128, null=True)), - ('tags', models.CharField(default=None, max_length=256, null=True)), - ('added', models.DateTimeField(auto_now_add=True)), - ('updated', models.DateTimeField(default=None, null=True)), + ("id", models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)), + ("url", models.URLField(unique=True)), + ("timestamp", models.CharField(default=None, max_length=32, null=True, unique=True)), + ("title", models.CharField(default=None, max_length=128, null=True)), + ("tags", models.CharField(default=None, max_length=256, null=True)), + 
("added", models.DateTimeField(auto_now_add=True)), + ("updated", models.DateTimeField(default=None, null=True)), ], ), ] diff --git a/archivebox/core/migrations/0002_auto_20200625_1521.py b/archivebox/core/migrations/0002_auto_20200625_1521.py index 4811282949..ff825ba642 100644 --- a/archivebox/core/migrations/0002_auto_20200625_1521.py +++ b/archivebox/core/migrations/0002_auto_20200625_1521.py @@ -4,15 +4,14 @@ class Migration(migrations.Migration): - dependencies = [ - ('core', '0001_initial'), + ("core", "0001_initial"), ] operations = [ migrations.AlterField( - model_name='snapshot', - name='timestamp', + model_name="snapshot", + name="timestamp", field=models.CharField(default=None, max_length=32, null=True), ), ] diff --git a/archivebox/core/migrations/0003_auto_20200630_1034.py b/archivebox/core/migrations/0003_auto_20200630_1034.py index 61fd472787..0d378f07b0 100644 --- a/archivebox/core/migrations/0003_auto_20200630_1034.py +++ b/archivebox/core/migrations/0003_auto_20200630_1034.py @@ -4,35 +4,34 @@ class Migration(migrations.Migration): - dependencies = [ - ('core', '0002_auto_20200625_1521'), + ("core", "0002_auto_20200625_1521"), ] operations = [ migrations.AlterField( - model_name='snapshot', - name='added', + model_name="snapshot", + name="added", field=models.DateTimeField(auto_now_add=True, db_index=True), ), migrations.AlterField( - model_name='snapshot', - name='tags', + model_name="snapshot", + name="tags", field=models.CharField(db_index=True, default=None, max_length=256, null=True), ), migrations.AlterField( - model_name='snapshot', - name='timestamp', + model_name="snapshot", + name="timestamp", field=models.CharField(db_index=True, default=None, max_length=32, null=True), ), migrations.AlterField( - model_name='snapshot', - name='title', + model_name="snapshot", + name="title", field=models.CharField(db_index=True, default=None, max_length=128, null=True), ), migrations.AlterField( - model_name='snapshot', - name='updated', + 
model_name="snapshot", + name="updated", field=models.DateTimeField(db_index=True, default=None, null=True), ), ] diff --git a/archivebox/core/migrations/0004_auto_20200713_1552.py b/archivebox/core/migrations/0004_auto_20200713_1552.py index 69836623d4..02f2738c95 100644 --- a/archivebox/core/migrations/0004_auto_20200713_1552.py +++ b/archivebox/core/migrations/0004_auto_20200713_1552.py @@ -4,15 +4,14 @@ class Migration(migrations.Migration): - dependencies = [ - ('core', '0003_auto_20200630_1034'), + ("core", "0003_auto_20200630_1034"), ] operations = [ migrations.AlterField( - model_name='snapshot', - name='timestamp', + model_name="snapshot", + name="timestamp", field=models.CharField(db_index=True, default=None, max_length=32, unique=True), preserve_default=False, ), diff --git a/archivebox/core/migrations/0005_auto_20200728_0326.py b/archivebox/core/migrations/0005_auto_20200728_0326.py index f367aeb1aa..8b1c32e5c8 100644 --- a/archivebox/core/migrations/0005_auto_20200728_0326.py +++ b/archivebox/core/migrations/0005_auto_20200728_0326.py @@ -4,25 +4,24 @@ class Migration(migrations.Migration): - dependencies = [ - ('core', '0004_auto_20200713_1552'), + ("core", "0004_auto_20200713_1552"), ] operations = [ migrations.AlterField( - model_name='snapshot', - name='tags', + model_name="snapshot", + name="tags", field=models.CharField(blank=True, db_index=True, max_length=256, null=True), ), migrations.AlterField( - model_name='snapshot', - name='title', + model_name="snapshot", + name="title", field=models.CharField(blank=True, db_index=True, max_length=128, null=True), ), migrations.AlterField( - model_name='snapshot', - name='updated', + model_name="snapshot", + name="updated", field=models.DateTimeField(blank=True, db_index=True, null=True), ), ] diff --git a/archivebox/core/migrations/0006_auto_20201012_1520.py b/archivebox/core/migrations/0006_auto_20201012_1520.py index dc96c8dab2..103a28776a 100644 --- 
a/archivebox/core/migrations/0006_auto_20201012_1520.py +++ b/archivebox/core/migrations/0006_auto_20201012_1520.py @@ -3,29 +3,24 @@ from django.db import migrations, models from django.utils.text import slugify + def forwards_func(apps, schema_editor): SnapshotModel = apps.get_model("core", "Snapshot") TagModel = apps.get_model("core", "Tag") - db_alias = schema_editor.connection.alias snapshots = SnapshotModel.objects.all() for snapshot in snapshots: - tags = snapshot.tags - tag_set = ( - set(tag.strip() for tag in (snapshot.tags_old or '').split(',')) - ) + tag_set = {tag.strip() for tag in (snapshot.tags_old or "").split(",")} tag_set.discard("") for tag in tag_set: - to_add, _ = TagModel.objects.get_or_create(name=tag, defaults={'slug': slugify(tag)}) + to_add, _ = TagModel.objects.get_or_create(name=tag, defaults={"slug": slugify(tag)}) snapshot.tags.add(to_add) def reverse_func(apps, schema_editor): SnapshotModel = apps.get_model("core", "Snapshot") - TagModel = apps.get_model("core", "Tag") - db_alias = schema_editor.connection.alias snapshots = SnapshotModel.objects.all() for snapshot in snapshots: tags = snapshot.tags.values_list("name", flat=True) @@ -34,37 +29,36 @@ def reverse_func(apps, schema_editor): class Migration(migrations.Migration): - dependencies = [ - ('core', '0005_auto_20200728_0326'), + ("core", "0005_auto_20200728_0326"), ] operations = [ migrations.RenameField( - model_name='snapshot', - old_name='tags', - new_name='tags_old', + model_name="snapshot", + old_name="tags", + new_name="tags_old", ), migrations.CreateModel( - name='Tag', + name="Tag", fields=[ - ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), - ('name', models.CharField(max_length=100, unique=True, verbose_name='name')), - ('slug', models.SlugField(max_length=100, unique=True, verbose_name='slug')), + ("id", models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ("name", 
models.CharField(max_length=100, unique=True, verbose_name="name")), + ("slug", models.SlugField(max_length=100, unique=True, verbose_name="slug")), ], options={ - 'verbose_name': 'Tag', - 'verbose_name_plural': 'Tags', + "verbose_name": "Tag", + "verbose_name_plural": "Tags", }, ), migrations.AddField( - model_name='snapshot', - name='tags', - field=models.ManyToManyField(to='core.Tag'), + model_name="snapshot", + name="tags", + field=models.ManyToManyField(to="core.Tag"), ), migrations.RunPython(forwards_func, reverse_func), migrations.RemoveField( - model_name='snapshot', - name='tags_old', + model_name="snapshot", + name="tags_old", ), ] diff --git a/archivebox/core/migrations/0007_archiveresult.py b/archivebox/core/migrations/0007_archiveresult.py index 29b269f6f8..3b31b15cf0 100644 --- a/archivebox/core/migrations/0007_archiveresult.py +++ b/archivebox/core/migrations/0007_archiveresult.py @@ -6,30 +6,47 @@ from django.db import migrations, models import django.db.models.deletion -from config import CONFIG -from index.json import to_json +# Handle old vs new import paths +try: + from archivebox.config import CONSTANTS + + ARCHIVE_DIR = CONSTANTS.ARCHIVE_DIR +except ImportError: + try: + from archivebox.config import CONFIG + + ARCHIVE_DIR = Path(CONFIG.get("ARCHIVE_DIR", "./archive")) + except ImportError: + ARCHIVE_DIR = Path("./archive") + +try: + from archivebox.misc.util import to_json +except ImportError: + try: + from index.json import to_json + except ImportError: + to_json = lambda x: json.dumps(x, indent=4, default=str) try: JSONField = models.JSONField except AttributeError: import jsonfield + JSONField = jsonfield.JSONField def forwards_func(apps, schema_editor): - from core.models import EXTRACTORS - Snapshot = apps.get_model("core", "Snapshot") ArchiveResult = apps.get_model("core", "ArchiveResult") snapshots = Snapshot.objects.all() for snapshot in snapshots: - out_dir = Path(CONFIG['ARCHIVE_DIR']) / snapshot.timestamp + out_dir = 
Path(ARCHIVE_DIR) / snapshot.timestamp try: - with open(out_dir / "index.json", "r") as f: + with open(out_dir / "index.json") as f: fs_index = json.load(f) - except Exception as e: + except Exception: continue history = fs_index["history"] @@ -42,37 +59,46 @@ def forwards_func(apps, schema_editor): snapshot=snapshot, pwd=result["pwd"], cmd=result.get("cmd") or [], - cmd_version=result.get("cmd_version") or 'unknown', + cmd_version=result.get("cmd_version") or "unknown", start_ts=result["start_ts"], end_ts=result["end_ts"], status=result["status"], - output=result.get("output") or 'null', + output=result.get("output") or "null", ) except Exception as e: print( - ' ! Skipping import due to missing/invalid index.json:', + " ! Skipping import due to missing/invalid index.json:", out_dir, e, - '(open an issue with this index.json for help)', + "(open an issue with this index.json for help)", ) def verify_json_index_integrity(snapshot): results = snapshot.archiveresult_set.all() - out_dir = Path(CONFIG['ARCHIVE_DIR']) / snapshot.timestamp - with open(out_dir / "index.json", "r") as f: + out_dir = Path(ARCHIVE_DIR) / snapshot.timestamp + with open(out_dir / "index.json") as f: index = json.load(f) history = index["history"] index_results = [result for extractor in history for result in history[extractor]] flattened_results = [result["start_ts"] for result in index_results] - + missing_results = [result for result in results if result.start_ts.isoformat() not in flattened_results] for missing in missing_results: - index["history"][missing.extractor].append({"cmd": missing.cmd, "cmd_version": missing.cmd_version, "end_ts": missing.end_ts.isoformat(), - "start_ts": missing.start_ts.isoformat(), "pwd": missing.pwd, "output": missing.output, - "schema": "ArchiveResult", "status": missing.status}) + index["history"][missing.extractor].append( + { + "cmd": missing.cmd, + "cmd_version": missing.cmd_version, + "end_ts": missing.end_ts.isoformat(), + "start_ts": 
missing.start_ts.isoformat(), + "pwd": missing.pwd, + "output": missing.output, + "schema": "ArchiveResult", + "status": missing.status, + }, + ) json_index = to_json(index) with open(out_dir / "index.json", "w") as f: @@ -89,25 +115,47 @@ def reverse_func(apps, schema_editor): class Migration(migrations.Migration): - dependencies = [ - ('core', '0006_auto_20201012_1520'), + ("core", "0006_auto_20201012_1520"), ] operations = [ migrations.CreateModel( - name='ArchiveResult', + name="ArchiveResult", fields=[ - ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), - ('cmd', JSONField()), - ('pwd', models.CharField(max_length=256)), - ('cmd_version', models.CharField(max_length=32)), - ('status', models.CharField(choices=[('succeeded', 'succeeded'), ('failed', 'failed'), ('skipped', 'skipped')], max_length=16)), - ('output', models.CharField(max_length=512)), - ('start_ts', models.DateTimeField()), - ('end_ts', models.DateTimeField()), - ('extractor', models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('wget', 'wget'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('headers', 'headers'), ('archive_org', 'archive_org')], max_length=32)), - ('snapshot', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.Snapshot')), + ("id", models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ("cmd", JSONField()), + ("pwd", models.CharField(max_length=256)), + ("cmd_version", models.CharField(max_length=32)), + ( + "status", + models.CharField(choices=[("succeeded", "succeeded"), ("failed", "failed"), ("skipped", "skipped")], max_length=16), + ), + ("output", models.CharField(max_length=512)), + ("start_ts", models.DateTimeField()), + ("end_ts", models.DateTimeField()), + ( + "extractor", + models.CharField( + choices=[ + 
("title", "title"), + ("favicon", "favicon"), + ("wget", "wget"), + ("singlefile", "singlefile"), + ("pdf", "pdf"), + ("screenshot", "screenshot"), + ("dom", "dom"), + ("readability", "readability"), + ("mercury", "mercury"), + ("git", "git"), + ("media", "media"), + ("headers", "headers"), + ("archivedotorg", "archivedotorg"), + ], + max_length=32, + ), + ), + ("snapshot", models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to="core.Snapshot")), ], ), migrations.RunPython(forwards_func, reverse_func), diff --git a/archivebox/core/migrations/0008_auto_20210105_1421.py b/archivebox/core/migrations/0008_auto_20210105_1421.py index e5b3387d42..68c408e7e4 100644 --- a/archivebox/core/migrations/0008_auto_20210105_1421.py +++ b/archivebox/core/migrations/0008_auto_20210105_1421.py @@ -4,15 +4,14 @@ class Migration(migrations.Migration): - dependencies = [ - ('core', '0007_archiveresult'), + ("core", "0007_archiveresult"), ] operations = [ migrations.AlterField( - model_name='archiveresult', - name='cmd_version', + model_name="archiveresult", + name="cmd_version", field=models.CharField(blank=True, default=None, max_length=32, null=True), ), ] diff --git a/archivebox/core/migrations/0009_auto_20210216_1038.py b/archivebox/core/migrations/0009_auto_20210216_1038.py index 2817fe547e..41747426ac 100644 --- a/archivebox/core/migrations/0009_auto_20210216_1038.py +++ b/archivebox/core/migrations/0009_auto_20210216_1038.py @@ -4,15 +4,14 @@ class Migration(migrations.Migration): - dependencies = [ - ('core', '0008_auto_20210105_1421'), + ("core", "0008_auto_20210105_1421"), ] operations = [ migrations.AlterField( - model_name='snapshot', - name='updated', + model_name="snapshot", + name="updated", field=models.DateTimeField(auto_now=True, db_index=True, null=True), ), ] diff --git a/archivebox/core/migrations/0010_auto_20210216_1055.py b/archivebox/core/migrations/0010_auto_20210216_1055.py index 0af61a3966..14bc18fd01 100644 --- 
a/archivebox/core/migrations/0010_auto_20210216_1055.py +++ b/archivebox/core/migrations/0010_auto_20210216_1055.py @@ -4,15 +4,14 @@ class Migration(migrations.Migration): - dependencies = [ - ('core', '0009_auto_20210216_1038'), + ("core", "0009_auto_20210216_1038"), ] operations = [ migrations.AlterField( - model_name='archiveresult', - name='start_ts', + model_name="archiveresult", + name="start_ts", field=models.DateTimeField(db_index=True), ), ] diff --git a/archivebox/core/migrations/0011_auto_20210216_1331.py b/archivebox/core/migrations/0011_auto_20210216_1331.py index d222667419..a1f6e7539c 100644 --- a/archivebox/core/migrations/0011_auto_20210216_1331.py +++ b/archivebox/core/migrations/0011_auto_20210216_1331.py @@ -5,20 +5,36 @@ class Migration(migrations.Migration): - dependencies = [ - ('core', '0010_auto_20210216_1055'), + ("core", "0010_auto_20210216_1055"), ] operations = [ migrations.AddField( - model_name='archiveresult', - name='uuid', + model_name="archiveresult", + name="uuid", field=models.UUIDField(default=uuid.uuid4, editable=False), ), migrations.AlterField( - model_name='archiveresult', - name='extractor', - field=models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('archive_org', 'archive_org')], max_length=32), + model_name="archiveresult", + name="extractor", + field=models.CharField( + choices=[ + ("title", "title"), + ("favicon", "favicon"), + ("headers", "headers"), + ("singlefile", "singlefile"), + ("pdf", "pdf"), + ("screenshot", "screenshot"), + ("dom", "dom"), + ("wget", "wget"), + ("readability", "readability"), + ("mercury", "mercury"), + ("git", "git"), + ("media", "media"), + ("archivedotorg", "archivedotorg"), + ], + max_length=32, + ), ), ] diff --git 
a/archivebox/core/migrations/0012_auto_20210216_1425.py b/archivebox/core/migrations/0012_auto_20210216_1425.py index 310058ac6c..27beb89797 100644 --- a/archivebox/core/migrations/0012_auto_20210216_1425.py +++ b/archivebox/core/migrations/0012_auto_20210216_1425.py @@ -4,20 +4,19 @@ class Migration(migrations.Migration): - dependencies = [ - ('core', '0011_auto_20210216_1331'), + ("core", "0011_auto_20210216_1331"), ] operations = [ migrations.AlterField( - model_name='archiveresult', - name='cmd_version', + model_name="archiveresult", + name="cmd_version", field=models.CharField(blank=True, default=None, max_length=128, null=True), ), migrations.AlterField( - model_name='archiveresult', - name='output', + model_name="archiveresult", + name="output", field=models.CharField(max_length=1024), ), ] diff --git a/archivebox/core/migrations/0013_auto_20210218_0729.py b/archivebox/core/migrations/0013_auto_20210218_0729.py index d3fe3b4f51..a774f156fa 100644 --- a/archivebox/core/migrations/0013_auto_20210218_0729.py +++ b/archivebox/core/migrations/0013_auto_20210218_0729.py @@ -4,15 +4,14 @@ class Migration(migrations.Migration): - dependencies = [ - ('core', '0012_auto_20210216_1425'), + ("core", "0012_auto_20210216_1425"), ] operations = [ migrations.AlterField( - model_name='snapshot', - name='title', + model_name="snapshot", + name="title", field=models.CharField(blank=True, db_index=True, max_length=256, null=True), ), ] diff --git a/archivebox/core/migrations/0014_auto_20210218_0729.py b/archivebox/core/migrations/0014_auto_20210218_0729.py index db81934f67..d14211a6fd 100644 --- a/archivebox/core/migrations/0014_auto_20210218_0729.py +++ b/archivebox/core/migrations/0014_auto_20210218_0729.py @@ -4,15 +4,14 @@ class Migration(migrations.Migration): - dependencies = [ - ('core', '0013_auto_20210218_0729'), + ("core", "0013_auto_20210218_0729"), ] operations = [ migrations.AlterField( - model_name='snapshot', - name='title', + model_name="snapshot", + 
name="title", field=models.CharField(blank=True, db_index=True, max_length=1024, null=True), ), ] diff --git a/archivebox/core/migrations/0015_auto_20210218_0730.py b/archivebox/core/migrations/0015_auto_20210218_0730.py index b782a21743..e2d99cdb9b 100644 --- a/archivebox/core/migrations/0015_auto_20210218_0730.py +++ b/archivebox/core/migrations/0015_auto_20210218_0730.py @@ -4,15 +4,14 @@ class Migration(migrations.Migration): - dependencies = [ - ('core', '0014_auto_20210218_0729'), + ("core", "0014_auto_20210218_0729"), ] operations = [ migrations.AlterField( - model_name='snapshot', - name='title', + model_name="snapshot", + name="title", field=models.CharField(blank=True, db_index=True, max_length=512, null=True), ), ] diff --git a/archivebox/core/migrations/0016_auto_20210218_1204.py b/archivebox/core/migrations/0016_auto_20210218_1204.py index 4637feab3c..1b9961729a 100644 --- a/archivebox/core/migrations/0016_auto_20210218_1204.py +++ b/archivebox/core/migrations/0016_auto_20210218_1204.py @@ -4,15 +4,14 @@ class Migration(migrations.Migration): - dependencies = [ - ('core', '0015_auto_20210218_0730'), + ("core", "0015_auto_20210218_0730"), ] operations = [ migrations.AlterField( - model_name='snapshot', - name='tags', - field=models.ManyToManyField(blank=True, to='core.Tag'), + model_name="snapshot", + name="tags", + field=models.ManyToManyField(blank=True, to="core.Tag"), ), ] diff --git a/archivebox/core/migrations/0017_auto_20210219_0211.py b/archivebox/core/migrations/0017_auto_20210219_0211.py index 221a250b41..4a9a4c827a 100644 --- a/archivebox/core/migrations/0017_auto_20210219_0211.py +++ b/archivebox/core/migrations/0017_auto_20210219_0211.py @@ -4,15 +4,14 @@ class Migration(migrations.Migration): - dependencies = [ - ('core', '0016_auto_20210218_1204'), + ("core", "0016_auto_20210218_1204"), ] operations = [ migrations.AlterField( - model_name='tag', - name='slug', - field=models.SlugField(blank=True, max_length=100, unique=True, 
verbose_name='slug'), + model_name="tag", + name="slug", + field=models.SlugField(blank=True, max_length=100, unique=True, verbose_name="slug"), ), ] diff --git a/archivebox/core/migrations/0018_auto_20210327_0952.py b/archivebox/core/migrations/0018_auto_20210327_0952.py index d0f3dde10d..dc5b2d1f58 100644 --- a/archivebox/core/migrations/0018_auto_20210327_0952.py +++ b/archivebox/core/migrations/0018_auto_20210327_0952.py @@ -4,20 +4,19 @@ class Migration(migrations.Migration): - dependencies = [ - ('core', '0017_auto_20210219_0211'), + ("core", "0017_auto_20210219_0211"), ] operations = [ migrations.AlterField( - model_name='tag', - name='name', + model_name="tag", + name="name", field=models.CharField(max_length=100, unique=True), ), migrations.AlterField( - model_name='tag', - name='slug', + model_name="tag", + name="slug", field=models.SlugField(blank=True, max_length=100, unique=True), ), ] diff --git a/archivebox/core/migrations/0019_auto_20210401_0654.py b/archivebox/core/migrations/0019_auto_20210401_0654.py index 735a654907..846bb61961 100644 --- a/archivebox/core/migrations/0019_auto_20210401_0654.py +++ b/archivebox/core/migrations/0019_auto_20210401_0654.py @@ -4,15 +4,14 @@ class Migration(migrations.Migration): - dependencies = [ - ('core', '0018_auto_20210327_0952'), + ("core", "0018_auto_20210327_0952"), ] operations = [ migrations.AlterField( - model_name='snapshot', - name='url', + model_name="snapshot", + name="url", field=models.URLField(db_index=True, unique=True), ), ] diff --git a/archivebox/core/migrations/0020_auto_20210410_1031.py b/archivebox/core/migrations/0020_auto_20210410_1031.py index e75243c6e0..610eaa43b6 100644 --- a/archivebox/core/migrations/0020_auto_20210410_1031.py +++ b/archivebox/core/migrations/0020_auto_20210410_1031.py @@ -4,20 +4,19 @@ class Migration(migrations.Migration): - dependencies = [ - ('core', '0019_auto_20210401_0654'), + ("core", "0019_auto_20210401_0654"), ] operations = [ migrations.AlterField( - 
model_name='archiveresult', - name='id', - field=models.AutoField(primary_key=True, serialize=False, verbose_name='ID'), + model_name="archiveresult", + name="id", + field=models.AutoField(primary_key=True, serialize=False, verbose_name="ID"), ), migrations.AlterField( - model_name='tag', - name='id', - field=models.AutoField(primary_key=True, serialize=False, verbose_name='ID'), + model_name="tag", + name="id", + field=models.AutoField(primary_key=True, serialize=False, verbose_name="ID"), ), ] diff --git a/archivebox/core/migrations/0021_auto_20220914_0934.py b/archivebox/core/migrations/0021_auto_20220914_0934.py new file mode 100644 index 0000000000..3f757723fd --- /dev/null +++ b/archivebox/core/migrations/0021_auto_20220914_0934.py @@ -0,0 +1,34 @@ +# Generated by Django 3.1.14 on 2022-09-14 09:34 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0020_auto_20210410_1031"), + ] + + operations = [ + migrations.AlterField( + model_name="archiveresult", + name="extractor", + field=models.CharField( + choices=[ + ("favicon", "favicon"), + ("headers", "headers"), + ("singlefile", "singlefile"), + ("pdf", "pdf"), + ("screenshot", "screenshot"), + ("dom", "dom"), + ("wget", "wget"), + ("title", "title"), + ("readability", "readability"), + ("mercury", "mercury"), + ("git", "git"), + ("media", "media"), + ("archivedotorg", "archivedotorg"), + ], + max_length=32, + ), + ), + ] diff --git a/archivebox/core/migrations/0022_auto_20231023_2008.py b/archivebox/core/migrations/0022_auto_20231023_2008.py new file mode 100644 index 0000000000..43dd1a69b3 --- /dev/null +++ b/archivebox/core/migrations/0022_auto_20231023_2008.py @@ -0,0 +1,35 @@ +# Generated by Django 3.1.14 on 2023-10-23 20:08 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0021_auto_20220914_0934"), + ] + + operations = [ + migrations.AlterField( + 
def get_table_columns(table_name, db_connection=None):
    """Return the set of column names defined on ``table_name``.

    Args:
        table_name: Name of an SQLite table. It is interpolated directly into a
            ``PRAGMA table_info(...)`` statement, so it must be a trusted,
            hard-coded identifier and never user input.
        db_connection: Optional database connection to inspect. When omitted the
            module-level ``connection`` is used, so existing one-argument
            callers keep working.

    Returns:
        A ``set`` of column-name strings.
    """
    cursor = (connection if db_connection is None else db_connection).cursor()
    cursor.execute(f"PRAGMA table_info({table_name})")
    # row[1] of every PRAGMA table_info row is the column name.
    return {row[1] for row in cursor.fetchall()}


def _table_exists(cursor, table_name):
    """Return True when a table called ``table_name`` is listed in sqlite_master."""
    cursor.execute(f"SELECT name FROM sqlite_master WHERE type='table' AND name='{table_name}'")
    return cursor.fetchone() is not None


def _table_has_rows(cursor, table_name):
    """Return True when ``table_name`` contains at least one row."""
    cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
    return cursor.fetchone()[0] > 0


def _upgrade_archiveresult_table(cursor, db_connection):
    """Rebuild core_archiveresult with the legacy columns plus ``uuid``; return whether it held rows."""
    has_data = _table_has_rows(cursor, "core_archiveresult")

    # Detect which version we're migrating from
    archiveresult_cols = get_table_columns("core_archiveresult", db_connection)
    has_uuid = "uuid" in archiveresult_cols
    has_abid = "abid" in archiveresult_cols

    # Create minimal table with only OLD fields that exist in v0.7.2/v0.8.6rc0
    # Migration 0025 will add the NEW fields (plugin, hook_name, output_files, etc.)
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS core_archiveresult_new (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            uuid TEXT,
            snapshot_id TEXT NOT NULL,
            cmd TEXT,
            pwd VARCHAR(256),
            cmd_version VARCHAR(128),
            start_ts DATETIME,
            end_ts DATETIME,
            status VARCHAR(15) NOT NULL DEFAULT 'queued',
            extractor VARCHAR(32),
            output VARCHAR(1024),

            FOREIGN KEY (snapshot_id) REFERENCES core_snapshot(id) ON DELETE CASCADE
        );
    """)

    if has_data:
        if has_uuid and not has_abid:
            # Migrating from v0.7.2+ (has uuid column)
            print("Migrating ArchiveResult from v0.7.2+ schema (with uuid)...")
            cursor.execute("""
                INSERT OR IGNORE INTO core_archiveresult_new (
                    id, uuid, snapshot_id, cmd, pwd, cmd_version,
                    start_ts, end_ts, status, extractor, output
                )
                SELECT
                    id, uuid, snapshot_id, cmd, pwd, cmd_version,
                    start_ts, end_ts, status, extractor, output
                FROM core_archiveresult;
            """)
        elif has_abid and not has_uuid:
            # Migrating from v0.8.6rc0 (has abid instead of uuid)
            print("Migrating ArchiveResult from v0.8.6rc0 schema...")
            cursor.execute("""
                INSERT OR IGNORE INTO core_archiveresult_new (
                    id, uuid, snapshot_id, cmd, pwd, cmd_version,
                    start_ts, end_ts, status, extractor, output
                )
                SELECT
                    id, abid as uuid, snapshot_id, cmd, pwd, cmd_version,
                    start_ts, end_ts, status, extractor, output
                FROM core_archiveresult;
            """)
        else:
            # Migrating from v0.7.2 (no uuid or abid column - generate fresh UUIDs)
            # NOTE(review): this branch is also taken when BOTH uuid and abid
            # exist, in which case the stored uuid values are replaced - confirm
            # that no released schema has both columns.
            # Imported here because this is the only branch that needs it.
            from archivebox.uuid_compat import uuid7

            print("Migrating ArchiveResult from v0.7.2 schema (no uuid - generating UUIDs)...")
            cursor.execute(
                "SELECT id, snapshot_id, cmd, pwd, cmd_version, start_ts, end_ts, status, extractor, output FROM core_archiveresult",
            )
            # Splice a freshly generated uuid in right after the id column.
            new_rows = [(record[0], uuid7().hex, *record[1:]) for record in cursor.fetchall()]
            cursor.executemany(
                """
                INSERT OR IGNORE INTO core_archiveresult_new (
                    id, uuid, snapshot_id, cmd, pwd, cmd_version,
                    start_ts, end_ts, status, extractor, output
                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """,
                new_rows,
            )

    cursor.execute("DROP TABLE IF EXISTS core_archiveresult;")
    cursor.execute("ALTER TABLE core_archiveresult_new RENAME TO core_archiveresult;")

    # Don't create indexes - migration 0025 will handle them
    return has_data


def _upgrade_snapshot_table(cursor, db_connection):
    """Rebuild core_snapshot with the v0.9.0 column names, copying rows from either legacy layout."""
    # Create table with NEW field names for timestamps (bookmarked_at, created_at, modified_at)
    # and all other fields needed by later migrations
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS core_snapshot_new (
            id TEXT PRIMARY KEY NOT NULL,
            url TEXT NOT NULL,
            timestamp VARCHAR(32) NOT NULL UNIQUE,
            title VARCHAR(512),
            crawl_id TEXT,
            parent_snapshot_id TEXT,

            bookmarked_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
            created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
            modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,

            downloaded_at DATETIME,
            status VARCHAR(15) NOT NULL DEFAULT 'queued',
            retry_at DATETIME,

            depth INTEGER NOT NULL DEFAULT 0,
            fs_version VARCHAR(10) NOT NULL DEFAULT '0.8.0',
            config TEXT NOT NULL DEFAULT '{}',
            notes TEXT NOT NULL DEFAULT '',
            num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
            num_uses_failed INTEGER NOT NULL DEFAULT 0,
            current_step INTEGER NOT NULL DEFAULT 0,

            FOREIGN KEY (crawl_id) REFERENCES crawls_crawl(id) ON DELETE CASCADE,
            FOREIGN KEY (parent_snapshot_id) REFERENCES core_snapshot(id) ON DELETE SET NULL
        );
    """)

    # core_snapshot should exist, but only copy when it does and holds rows
    if _table_exists(cursor, "core_snapshot") and _table_has_rows(cursor, "core_snapshot"):
        # Detect which version we're migrating from
        snapshot_cols = get_table_columns("core_snapshot", db_connection)
        has_added = "added" in snapshot_cols
        has_bookmarked_at = "bookmarked_at" in snapshot_cols

        if has_added and not has_bookmarked_at:
            # Migrating from v0.7.2 (has added/updated fields)
            print("Migrating Snapshot from v0.7.2 schema...")
            # Transform added→bookmarked_at/created_at and updated→modified_at
            cursor.execute("""
                INSERT OR IGNORE INTO core_snapshot_new (
                    id, url, timestamp, title,
                    bookmarked_at, created_at, modified_at,
                    status
                )
                SELECT
                    id, url, timestamp, title,
                    COALESCE(added, CURRENT_TIMESTAMP) as bookmarked_at,
                    COALESCE(added, CURRENT_TIMESTAMP) as created_at,
                    COALESCE(updated, added, CURRENT_TIMESTAMP) as modified_at,
                    'queued' as status
                FROM core_snapshot;
            """)
        elif has_bookmarked_at and not has_added:
            # Migrating from v0.8.6rc0 (already has bookmarked_at/created_at/modified_at)
            print("Migrating Snapshot from v0.8.6rc0 schema...")
            # Always-copied columns, then the optional ones that actually exist
            cols = ["id", "url", "timestamp", "title", "bookmarked_at", "created_at", "modified_at", "downloaded_at"]
            cols += [name for name in ("crawl_id", "status", "retry_at") if name in snapshot_cols]
            column_list = ", ".join(cols)

            cursor.execute(f"""
                INSERT OR IGNORE INTO core_snapshot_new ({column_list})
                SELECT {column_list}
                FROM core_snapshot;
            """)
        else:
            # The old table is dropped right below, so continuing here would
            # silently delete every snapshot row. Abort instead so the
            # migration fails and the existing data stays untouched.
            raise RuntimeError(
                f"Unexpected Snapshot schema - has_added={has_added}, has_bookmarked_at={has_bookmarked_at}; "
                "refusing to rebuild core_snapshot because its rows cannot be mapped to the v0.9.0 columns",
            )

    cursor.execute("DROP TABLE IF EXISTS core_snapshot;")
    cursor.execute("ALTER TABLE core_snapshot_new RENAME TO core_snapshot;")

    # Create indexes
    cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_url_idx ON core_snapshot(url);")
    cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_timestamp_idx ON core_snapshot(timestamp);")
    cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_bookmarked_at_idx ON core_snapshot(bookmarked_at);")
    cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_crawl_id_idx ON core_snapshot(crawl_id);")
    cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_status_idx ON core_snapshot(status);")
    cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_retry_at_idx ON core_snapshot(retry_at);")
    cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_created_at_idx ON core_snapshot(created_at);")
    cursor.execute("CREATE UNIQUE INDEX IF NOT EXISTS core_snapshot_url_crawl_unique ON core_snapshot(url, crawl_id);")


def _upgrade_tag_table(cursor):
    """Rebuild core_tag with INTEGER ids and audit columns, renumbering UUID-keyed tags."""
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS core_tag_new (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
            modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,

            name VARCHAR(100) NOT NULL UNIQUE,
            slug VARCHAR(100) NOT NULL UNIQUE,

            created_by_id INTEGER,

            FOREIGN KEY (created_by_id) REFERENCES auth_user(id) ON DELETE CASCADE
        );
    """)

    if _table_exists(cursor, "core_tag") and _table_has_rows(cursor, "core_tag"):
        cursor.execute("PRAGMA table_info(core_tag)")
        # row[1] is the column name, row[2] its declared type
        tag_id_type = next((row[2] for row in cursor.fetchall() if row[1] == "id"), None)

        if tag_id_type and "char" in tag_id_type.lower():
            # v0.8.6rc0: Tag IDs are UUIDs, need to convert to INTEGER
            print("Converting Tag IDs from UUID to INTEGER...")

            # Get all tags with their UUIDs
            cursor.execute("SELECT id, name, slug, created_at, modified_at, created_by_id FROM core_tag ORDER BY name")
            tags = cursor.fetchall()

            # New INTEGER ids are assigned 1..N in name order
            uuid_to_int_map = {tag[0]: new_id for new_id, tag in enumerate(tags, start=1)}
            cursor.executemany(
                """
                INSERT OR IGNORE INTO core_tag_new (id, name, slug, created_at, modified_at, created_by_id)
                VALUES (?, ?, ?, ?, ?, ?)
                """,
                [(uuid_to_int_map[tag[0]], *tag[1:]) for tag in tags],
            )

            # Update snapshot_tags to use new INTEGER IDs
            if _table_exists(cursor, "core_snapshot_tags"):
                cursor.execute("SELECT id, snapshot_id, tag_id FROM core_snapshot_tags")
                snapshot_tags = cursor.fetchall()

                # Delete old entries
                cursor.execute("DELETE FROM core_snapshot_tags")

                # Re-insert with new integer tag IDs; links to unknown tags are dropped
                remapped = [
                    (st_id, snapshot_id, uuid_to_int_map[old_tag_id])
                    for st_id, snapshot_id, old_tag_id in snapshot_tags
                    if old_tag_id in uuid_to_int_map
                ]
                if remapped:
                    cursor.executemany(
                        """
                        INSERT OR IGNORE INTO core_snapshot_tags (id, snapshot_id, tag_id)
                        VALUES (?, ?, ?)
                        """,
                        remapped,
                    )
        else:
            # v0.7.2: Tag IDs are already INTEGER
            print("Migrating Tag from v0.7.2 schema...")
            cursor.execute("""
                INSERT OR IGNORE INTO core_tag_new (id, name, slug)
                SELECT id, name, slug
                FROM core_tag;
            """)

    cursor.execute("DROP TABLE IF EXISTS core_tag;")
    cursor.execute("ALTER TABLE core_tag_new RENAME TO core_tag;")

    # Create indexes
    cursor.execute("CREATE INDEX IF NOT EXISTS core_tag_created_at_idx ON core_tag(created_at);")
    cursor.execute("CREATE INDEX IF NOT EXISTS core_tag_created_by_id_idx ON core_tag(created_by_id);")


def upgrade_core_tables(apps, schema_editor):
    """Upgrade core tables from v0.7.2 or v0.8.6rc0 to v0.9.0.

    Rebuilds ``core_archiveresult``, ``core_snapshot`` and ``core_tag`` with raw
    SQL: each table is recreated as ``<name>_new``, existing rows are copied
    across, the old table is dropped and the new one renamed into place.

    Args:
        apps: Historical app registry passed by ``RunPython`` (unused).
        schema_editor: Schema editor passed by ``RunPython``; its ``connection``
            is used so the statements run against the database that is
            actually being migrated.

    Raises:
        RuntimeError: If ``core_snapshot`` holds rows but its columns match
            neither the v0.7.2 nor the v0.8.6rc0 layout. Raised before the old
            table is dropped, so no snapshot data is lost.
    """
    db_connection = schema_editor.connection
    cursor = db_connection.cursor()

    if not _table_exists(cursor, "core_archiveresult"):
        # Fresh install - no migration needed, tables will be created by later migrations
        return

    # PART 1: core_archiveresult
    has_data = _upgrade_archiveresult_table(cursor, db_connection)
    # PART 2: core_snapshot
    _upgrade_snapshot_table(cursor, db_connection)
    # PART 3: core_tag
    _upgrade_tag_table(cursor)

    if has_data:
        print("✓ Core tables upgraded to v0.9.0")
+ migrations.RemoveField(model_name="snapshot", name="added"), + migrations.RemoveField(model_name="snapshot", name="updated"), + migrations.AddField( + model_name="snapshot", + name="bookmarked_at", + field=models.DateTimeField(db_index=True, default=django.utils.timezone.now), + ), + migrations.AddField( + model_name="snapshot", + name="created_at", + field=models.DateTimeField(db_index=True, default=django.utils.timezone.now), + ), + migrations.AddField( + model_name="snapshot", + name="modified_at", + field=models.DateTimeField(auto_now=True), + ), + # Declare fs_version (already created in database with DEFAULT '0.8.0') + migrations.AddField( + model_name="snapshot", + name="fs_version", + field=models.CharField( + max_length=10, + default="0.8.0", + help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().', + ), + ), + # SnapshotTag table already exists from v0.7.2, just declare it in state + migrations.CreateModel( + name="SnapshotTag", + fields=[ + ("id", models.AutoField(primary_key=True, serialize=False)), + ("snapshot", models.ForeignKey(to="core.Snapshot", db_column="snapshot_id", on_delete=models.CASCADE)), + ("tag", models.ForeignKey(to="core.Tag", db_column="tag_id", on_delete=models.CASCADE)), + ], + options={ + "db_table": "core_snapshot_tags", + "unique_together": {("snapshot", "tag")}, + }, + ), + # Declare that Snapshot.tags M2M already uses through=SnapshotTag (from v0.7.2) + migrations.AlterField( + model_name="snapshot", + name="tags", + field=models.ManyToManyField( + "Tag", + blank=True, + related_name="snapshot_set", + through="SnapshotTag", + through_fields=("snapshot", "tag"), + ), + ), + ], + ), + ] diff --git a/archivebox/core/migrations/0024_assign_default_crawl.py b/archivebox/core/migrations/0024_assign_default_crawl.py new file mode 100644 index 0000000000..e5dd70d806 --- /dev/null +++ b/archivebox/core/migrations/0024_assign_default_crawl.py @@ -0,0 +1,155 @@ +# 
def create_default_crawl_and_assign_snapshots(apps, schema_editor):
    """
    Create a default crawl for migrated snapshots and assign all snapshots without a crawl to it.
    Uses raw SQL because the app registry isn't fully populated during migrations.

    Does nothing when every snapshot already has a ``crawl_id``. Otherwise it
    makes sure an ``auth_user`` row with id 1 exists (inserting a ``system``
    superuser when missing), inserts one ``sealed`` crawl owned by that user and
    points every crawl-less snapshot at it.

    Args:
        apps: Historical app registry passed by ``RunPython`` (unused).
        schema_editor: Schema editor passed by ``RunPython``; its ``connection``
            is used so the statements run against the database being migrated.
    """
    import uuid as uuid_lib
    from datetime import datetime, timezone

    cursor = schema_editor.connection.cursor()

    # Check if there are any snapshots without a crawl
    cursor.execute("SELECT COUNT(*) FROM core_snapshot WHERE crawl_id IS NULL")
    snapshots_without_crawl = cursor.fetchone()[0]

    if not snapshots_without_crawl:
        print("✓ Fresh install or all snapshots already have crawls")
        return

    # UTC in "YYYY-MM-DD HH:MM:SS" form, i.e. the same representation SQLite's
    # CURRENT_TIMESTAMP produces for the column defaults used by the raw-SQL
    # table definitions in these migrations. A naive local-time value would be
    # off by the host's UTC offset relative to those rows.
    now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")

    # Get or create system user (pk=1)
    cursor.execute("SELECT id FROM auth_user WHERE id = 1")
    if not cursor.fetchone():
        # NOTE(review): this fails on the username UNIQUE constraint if a user
        # called 'system' already exists under a different id - confirm that
        # cannot happen on upgraded installs.
        cursor.execute(
            """
            INSERT INTO auth_user (id, password, is_superuser, username, first_name, last_name, email, is_staff, is_active, date_joined)
            VALUES (1, '!', 1, 'system', '', '', '', 1, 1, ?)
            """,
            [now],
        )

    # Create a default crawl for migrated snapshots
    # At this point crawls_crawl is guaranteed to have v0.9.0 schema (crawls/0002 ran first)
    # Stored as 32-char hex (no hyphens): that is how Django persists UUIDField
    # values on SQLite, so ORM lookups by this id can match the row.
    crawl_id = uuid_lib.uuid4().hex

    cursor.execute(
        """
        INSERT INTO crawls_crawl (
            id, created_at, modified_at, num_uses_succeeded, num_uses_failed,
            urls, max_depth, tags_str, label, notes, output_dir,
            status, retry_at, created_by_id, schedule_id, config, persona_id
        ) VALUES (?, ?, ?, 0, 0, '', 0, '', 'Migrated from v0.7.2/v0.8.6',
                  'Auto-created crawl for migrated snapshots', '',
                  'sealed', ?, 1, NULL, '{}', NULL)
        """,
        [crawl_id, now, now, now],
    )

    # Assign all snapshots without a crawl to the default crawl
    cursor.execute("UPDATE core_snapshot SET crawl_id = ? WHERE crawl_id IS NULL", [crawl_id])

    print(f"✓ Assigned {snapshots_without_crawl} snapshots to default crawl {crawl_id}")
def copy_old_fields_to_new(apps, schema_editor):
    """Copy data from old field names to new field names after AddField operations.

    Back-fills the ArchiveResult columns added earlier in this migration from
    the legacy columns that are about to be removed:

    * ``extractor`` -> ``plugin`` and ``output`` -> ``output_str``, only where
      the new column is still empty.
    * ``start_ts`` -> ``created_at`` and ``end_ts``/``start_ts`` ->
      ``modified_at`` wherever a legacy timestamp exists.

    Args:
        apps: Historical app registry passed by ``RunPython`` (unused).
        schema_editor: Schema editor passed by ``RunPython``; its ``connection``
            is used so the statements run against the database being migrated.
    """
    cursor = schema_editor.connection.cursor()

    # Check if old fields still exist
    cursor.execute("PRAGMA table_info(core_archiveresult)")
    cols = {row[1] for row in cursor.fetchall()}

    if "extractor" in cols and "plugin" in cols:
        # Copy extractor -> plugin
        cursor.execute("UPDATE core_archiveresult SET plugin = COALESCE(extractor, '') WHERE plugin = '' OR plugin IS NULL")

    if "output" in cols and "output_str" in cols:
        # Copy output -> output_str
        cursor.execute("UPDATE core_archiveresult SET output_str = COALESCE(output, '') WHERE output_str = '' OR output_str IS NULL")

    # created_at / modified_at are added by this same migration with a
    # now()/auto_now default, so existing rows already hold the migration time
    # and are never NULL or ''. Guarding the copy on "IS NULL OR = ''" therefore
    # never matched and the historical timestamps were lost; copy whenever a
    # legacy timestamp is available instead.
    if "start_ts" in cols and "created_at" in cols:
        cursor.execute("UPDATE core_archiveresult SET created_at = start_ts WHERE start_ts IS NOT NULL")

    if "end_ts" in cols and "modified_at" in cols:
        cursor.execute(
            "UPDATE core_archiveresult SET modified_at = COALESCE(end_ts, start_ts) WHERE end_ts IS NOT NULL OR start_ts IS NOT NULL",
        )

    # NOTE: Snapshot timestamps (added→bookmarked_at, updated→modified_at) were already
    # transformed by migration 0023, so we don't need to copy them here.
    # NOTE: UUIDs are already populated by migration 0023 for all migration paths
field=models.PositiveIntegerField(default=0), + ), + migrations.AddField( + model_name="archiveresult", + name="output_files", + field=models.JSONField(default=dict, help_text="Dict of {relative_path: {metadata}}"), + ), + migrations.AddField( + model_name="archiveresult", + name="output_json", + field=models.JSONField(blank=True, default=None, help_text="Structured metadata (headers, redirects, etc.)", null=True), + ), + migrations.AddField( + model_name="archiveresult", + name="output_mimetypes", + field=models.CharField(blank=True, default="", help_text="CSV of mimetypes sorted by size", max_length=512), + ), + migrations.AddField( + model_name="archiveresult", + name="output_size", + field=models.BigIntegerField(default=0, help_text="Total bytes of all output files"), + ), + migrations.AddField( + model_name="archiveresult", + name="output_str", + field=models.TextField(blank=True, default="", help_text="Human-readable output summary"), + ), + migrations.AddField( + model_name="archiveresult", + name="plugin", + field=models.CharField(db_index=True, default="", max_length=32), + ), + migrations.AddField( + model_name="archiveresult", + name="retry_at", + field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True), + ), + # NOTE: bookmarked_at and created_at already added by migration 0023 + migrations.AddField( + model_name="snapshot", + name="config", + field=models.JSONField(default=dict), + ), + migrations.AddField( + model_name="snapshot", + name="current_step", + field=models.PositiveSmallIntegerField( + db_index=True, + default=0, + help_text="Current hook step being executed (0-9). 
Used for sequential hook execution.", + ), + ), + migrations.AddField( + model_name="snapshot", + name="depth", + field=models.PositiveSmallIntegerField(db_index=True, default=0), + ), + migrations.AddField( + model_name="snapshot", + name="downloaded_at", + field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True), + ), + # NOTE: fs_version already added by migration 0023 with default='0.8.0' + # NOTE: modified_at already added by migration 0023 + migrations.AddField( + model_name="snapshot", + name="notes", + field=models.TextField(blank=True, default=""), + ), + migrations.AddField( + model_name="snapshot", + name="num_uses_failed", + field=models.PositiveIntegerField(default=0), + ), + migrations.AddField( + model_name="snapshot", + name="num_uses_succeeded", + field=models.PositiveIntegerField(default=0), + ), + migrations.AddField( + model_name="snapshot", + name="parent_snapshot", + field=models.ForeignKey( + blank=True, + help_text="Parent snapshot that discovered this URL (for recursive crawling)", + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="child_snapshots", + to="core.snapshot", + ), + ), + migrations.AddField( + model_name="snapshot", + name="retry_at", + field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True), + ), + migrations.AddField( + model_name="snapshot", + name="status", + field=models.CharField( + choices=[("queued", "Queued"), ("started", "Started"), ("sealed", "Sealed")], + db_index=True, + default="queued", + max_length=15, + ), + ), + migrations.AddField( + model_name="tag", + name="created_at", + field=models.DateTimeField(db_index=True, default=django.utils.timezone.now, null=True), + ), + migrations.AddField( + model_name="tag", + name="created_by", + field=models.ForeignKey( + default=archivebox.base_models.models.get_or_create_system_user_pk, + null=True, + on_delete=django.db.models.deletion.CASCADE, + 
related_name="tag_set", + to=settings.AUTH_USER_MODEL, + ), + ), + migrations.AddField( + model_name="tag", + name="modified_at", + field=models.DateTimeField(auto_now=True), + ), + # Copy data from old field names to new field names after AddField operations + migrations.RunPython( + copy_old_fields_to_new, + reverse_code=migrations.RunPython.noop, + ), + # Now remove the old ArchiveResult fields after data has been copied + migrations.RemoveField( + model_name="archiveresult", + name="extractor", + ), + migrations.RemoveField( + model_name="archiveresult", + name="output", + ), + # NOTE: Snapshot's added/updated were already removed by migration 0023 + migrations.AlterField( + model_name="archiveresult", + name="end_ts", + field=models.DateTimeField(blank=True, default=None, null=True), + ), + migrations.AlterField( + model_name="archiveresult", + name="id", + field=models.AutoField(editable=False, primary_key=True, serialize=False), + ), + migrations.AlterField( + model_name="archiveresult", + name="start_ts", + field=models.DateTimeField(blank=True, default=None, null=True), + ), + migrations.AlterField( + model_name="archiveresult", + name="status", + field=models.CharField( + choices=[ + ("queued", "Queued"), + ("started", "Started"), + ("backoff", "Waiting to retry"), + ("succeeded", "Succeeded"), + ("failed", "Failed"), + ("skipped", "Skipped"), + ], + db_index=True, + default="queued", + max_length=15, + ), + ), + migrations.AlterField( + model_name="archiveresult", + name="uuid", + field=models.UUIDField(blank=True, db_index=True, default=uuid7, null=True), + ), + migrations.AlterField( + model_name="snapshot", + name="crawl", + field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name="snapshot_set", to="crawls.crawl"), + ), + migrations.AlterField( + model_name="snapshot", + name="id", + field=models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True), + ), + migrations.AlterField( + 
model_name="snapshot", + name="tags", + field=models.ManyToManyField( + blank=True, + related_name="snapshot_set", + through="core.SnapshotTag", + through_fields=("snapshot", "tag"), + to="core.tag", + ), + ), + migrations.AlterField( + model_name="snapshot", + name="timestamp", + field=models.CharField(db_index=True, editable=False, max_length=32, unique=True), + ), + migrations.AlterField( + model_name="snapshot", + name="url", + field=models.URLField(db_index=True), + ), + migrations.AlterField( + model_name="tag", + name="slug", + field=models.SlugField(editable=False, max_length=100, unique=True), + ), + migrations.AddConstraint( + model_name="snapshot", + constraint=models.UniqueConstraint(fields=("url", "crawl"), name="unique_url_per_crawl"), + ), + migrations.AddConstraint( + model_name="snapshot", + constraint=models.UniqueConstraint(fields=("timestamp",), name="unique_timestamp"), + ), + ] diff --git a/archivebox/core/migrations/0026_add_process_to_archiveresult.py b/archivebox/core/migrations/0026_add_process_to_archiveresult.py new file mode 100644 index 0000000000..7381b98ee1 --- /dev/null +++ b/archivebox/core/migrations/0026_add_process_to_archiveresult.py @@ -0,0 +1,34 @@ +# Generated by Django 6.0 on 2026-01-01 23:28 + +import django.db.models.deletion +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0025_alter_archiveresult_options_alter_snapshot_options_and_more"), + ("machine", "0007_add_process_type_and_parent"), + ] + + operations = [ + migrations.RemoveField( + model_name="archiveresult", + name="num_uses_failed", + ), + migrations.RemoveField( + model_name="archiveresult", + name="num_uses_succeeded", + ), + migrations.AddField( + model_name="archiveresult", + name="process", + field=models.OneToOneField( + blank=True, + help_text="Process execution details for this archive result", + null=True, + on_delete=django.db.models.deletion.PROTECT, + related_name="archiveresult", + 
to="machine.process", + ), + ), + ] diff --git a/archivebox/core/migrations/0027_copy_archiveresult_to_process.py b/archivebox/core/migrations/0027_copy_archiveresult_to_process.py new file mode 100644 index 0000000000..37c4f8df1d --- /dev/null +++ b/archivebox/core/migrations/0027_copy_archiveresult_to_process.py @@ -0,0 +1,418 @@ +# Generated by hand on 2026-01-01 +# Copies ArchiveResult cmd/pwd/cmd_version data to Process records before removing old fields + +from django.db import migrations, connection +import json +from pathlib import Path +from archivebox.uuid_compat import uuid7 + + +def parse_cmd_field(cmd_raw): + """ + Parse cmd field which could be: + 1. JSON array string: '["wget", "-p", "url"]' + 2. Space-separated string: 'wget -p url' + 3. NULL/empty + + Returns list of strings. + """ + if not cmd_raw: + return [] + + cmd_raw = cmd_raw.strip() + + if not cmd_raw: + return [] + + # Try to parse as JSON first + if cmd_raw.startswith("["): + try: + parsed = json.loads(cmd_raw) + if isinstance(parsed, list): + return [str(x) for x in parsed] + except json.JSONDecodeError: + pass + + # Fallback: split by spaces (simple approach, doesn't handle quoted strings) + # This is acceptable since old cmd fields were mostly simple commands + return cmd_raw.split() + + +def get_or_create_current_machine(cursor): + """Get or create Machine.current() using raw SQL.""" + import socket + from datetime import datetime + + # Simple machine detection - get hostname as guid + hostname = socket.gethostname() + guid = f"host_{hostname}" # Simple but stable identifier + + # Check if machine exists + cursor.execute("SELECT id FROM machine_machine WHERE guid = ?", [guid]) + row = cursor.fetchone() + + if row: + return row[0] + + # Create new machine + # Django UUIDField stores UUIDs as 32-char hex (no dashes) in SQLite + machine_id = uuid7().hex + now = datetime.now().isoformat() + + # Check which columns exist (schema differs between 0.8.x and 0.9.x) + cursor.execute("PRAGMA 
table_info(machine_machine)") + machine_cols = {row[1] for row in cursor.fetchall()} + + # Build INSERT statement based on available columns + if "config" in machine_cols: + # 0.9.x schema with config column + cursor.execute( + """ + INSERT INTO machine_machine ( + id, created_at, modified_at, guid, hostname, + hw_in_docker, hw_in_vm, hw_manufacturer, hw_product, hw_uuid, + os_arch, os_family, os_platform, os_release, os_kernel, + stats, config, num_uses_failed, num_uses_succeeded + ) VALUES (?, ?, ?, ?, ?, 0, 0, '', '', '', + '', '', '', '', '', '{}', '{}', 0, 0) + """, + [machine_id, now, now, guid, hostname], + ) + else: + # 0.8.x schema without config column + cursor.execute( + """ + INSERT INTO machine_machine ( + id, created_at, modified_at, guid, hostname, + hw_in_docker, hw_in_vm, hw_manufacturer, hw_product, hw_uuid, + os_arch, os_family, os_platform, os_release, os_kernel, + stats, num_uses_failed, num_uses_succeeded + ) VALUES (?, ?, ?, ?, ?, 0, 0, '', '', '', + '', '', '', '', '', '{}', 0, 0) + """, + [machine_id, now, now, guid, hostname], + ) + + return machine_id + + +def get_or_create_binary(cursor, machine_id, name, abspath, version): + """ + Get or create Binary record. + + Args: + cursor: DB cursor + machine_id: Machine FK + name: Binary name (basename of command) + abspath: Absolute path to binary (or just name if path unknown) + version: Version string + + Returns: + binary_id (str) + """ + from datetime import datetime + + # If abspath is just a name without slashes, it's not a full path + # Store it in both fields for simplicity + if "/" not in abspath: + # Not a full path - store as-is + pass + + # Check if binary exists with same machine, name, abspath, version + cursor.execute( + """ + SELECT id FROM machine_binary + WHERE machine_id = ? AND name = ? AND abspath = ? AND version = ? 
+ """, + [machine_id, name, abspath, version], + ) + + row = cursor.fetchone() + if row: + return row[0] + + # Create new binary + # Django UUIDField stores UUIDs as 32-char hex (no dashes) in SQLite + binary_id = uuid7().hex + now = datetime.now().isoformat() + + # Check which columns exist (schema differs between 0.8.x and 0.9.x) + cursor.execute("PRAGMA table_info(machine_binary)") + binary_cols = {row[1] for row in cursor.fetchall()} + + # Use only columns that exist in current schema + # 0.8.x schema: id, created_at, modified_at, machine_id, name, binprovider, abspath, version, sha256, num_uses_failed, num_uses_succeeded + # 0.9.x schema adds: binproviders, overrides, status, retry_at, output_dir + if "binproviders" in binary_cols: + # 0.9.x schema + cursor.execute( + """ + INSERT INTO machine_binary ( + id, created_at, modified_at, machine_id, + name, binproviders, overrides, binprovider, abspath, version, sha256, + status, retry_at, output_dir, + num_uses_failed, num_uses_succeeded + ) VALUES (?, ?, ?, ?, ?, 'env', '{}', 'env', ?, ?, '', + 'succeeded', NULL, '', 0, 0) + """, + [binary_id, now, now, machine_id, name, abspath, version], + ) + else: + # 0.8.x schema (simpler) + cursor.execute( + """ + INSERT INTO machine_binary ( + id, created_at, modified_at, machine_id, + name, binprovider, abspath, version, sha256, + num_uses_failed, num_uses_succeeded + ) VALUES (?, ?, ?, ?, ?, 'env', ?, ?, '', 0, 0) + """, + [binary_id, now, now, machine_id, name, abspath, version], + ) + + return binary_id + + +def map_status(old_status): + """ + Map old ArchiveResult status to Process status and exit_code. 
+ + Args: + old_status: One of: queued, started, backoff, succeeded, failed, skipped + + Returns: + (process_status, exit_code) tuple + """ + status_map = { + "queued": ("queued", None), + "started": ("running", None), + "backoff": ("queued", None), + "succeeded": ("exited", 0), + "failed": ("exited", 1), + "skipped": ("exited", None), # Skipped = exited without error + } + + return status_map.get(old_status, ("queued", None)) + + +def create_process(cursor, machine_id, pwd, cmd, status, exit_code, started_at, ended_at, binary_id): + """ + Create a Process record. + + Returns: + process_id (str) + """ + from datetime import datetime + + # Django UUIDField stores UUIDs as 32-char hex (no dashes) in SQLite + process_id = uuid7().hex + now = datetime.now().isoformat() + + # Convert cmd array to JSON + cmd_json = json.dumps(cmd) + + # Set retry_at to now for queued processes, NULL otherwise + retry_at = now if status == "queued" else None + + cursor.execute( + """ + INSERT INTO machine_process ( + id, created_at, modified_at, machine_id, parent_id, process_type, + pwd, cmd, env, timeout, + pid, exit_code, stdout, stderr, + started_at, ended_at, + binary_id, iface_id, url, + status, retry_at + ) VALUES (?, ?, ?, ?, NULL, 'cli', + ?, ?, '{}', 120, + NULL, ?, '', '', + ?, ?, + ?, NULL, NULL, + ?, ?) + """, + [ + process_id, + now, + now, + machine_id, + pwd, + cmd_json, + exit_code, + started_at, + ended_at, + binary_id, + status, + retry_at, + ], + ) + + return process_id + + +def copy_archiveresult_data_to_process(apps, schema_editor): + """ + Copy old ArchiveResult execution data (cmd, pwd, cmd_version) to Process records. + + For each ArchiveResult without a process_id: + 1. Parse cmd field (handle both JSON array and space-separated string) + 2. Extract binary name/path from cmd[0] + 3. Get or create Binary record with machine, name, abspath, version + 4. Create Process record with mapped fields + 5. 
Link ArchiveResult.process_id to new Process + + Status mapping: + - queued → queued (exit_code=None) + - started → running (exit_code=None) + - backoff → queued (exit_code=None) + - succeeded → exited (exit_code=0) + - failed → exited (exit_code=1) + - skipped → exited (exit_code=None) + """ + cursor = connection.cursor() + + # Check if old fields still exist (skip if fresh install or already migrated) + cursor.execute("PRAGMA table_info(core_archiveresult)") + cols = {row[1] for row in cursor.fetchall()} + + print(f"DEBUG 0027: Columns found: {sorted(cols)}") + print( + f"DEBUG 0027: Has cmd={('cmd' in cols)}, pwd={('pwd' in cols)}, cmd_version={('cmd_version' in cols)}, process_id={('process_id' in cols)}", + ) + + if "cmd" not in cols or "pwd" not in cols or "cmd_version" not in cols: + print("✓ Fresh install or fields already removed - skipping data copy") + return + + # Check if process_id field exists (should exist from 0026) + if "process_id" not in cols: + print("✗ ERROR: process_id field not found. 
Migration 0026 must run first.") + return + + # Get or create Machine.current() + machine_id = get_or_create_current_machine(cursor) + + # Get ArchiveResults without process_id that have cmd data + # Use plugin (extractor was renamed to plugin in migration 0025) + cursor.execute(""" + SELECT id, snapshot_id, plugin, cmd, pwd, cmd_version, + status, start_ts, end_ts, created_at + FROM core_archiveresult + WHERE process_id IS NULL + AND (cmd IS NOT NULL OR pwd IS NOT NULL) + """) + + results = cursor.fetchall() + + if not results: + print("✓ No ArchiveResults need Process migration") + return + + print(f"Migrating {len(results)} ArchiveResults to Process records...") + + migrated_count = 0 + skipped_count = 0 + error_count = 0 + + for i, row in enumerate(results): + ar_id, snapshot_id, plugin, cmd_raw, pwd, cmd_version, status, start_ts, end_ts, created_at = row + + if i == 0: + print(f"DEBUG 0027: First row: ar_id={ar_id}, plugin={plugin}, cmd={cmd_raw[:50] if cmd_raw else None}, status={status}") + + try: + # Parse cmd field + cmd_array = parse_cmd_field(cmd_raw) + + if i == 0: + print(f"DEBUG 0027: Parsed cmd: {cmd_array}") + + # Extract binary info from cmd[0] if available + binary_id = None + if cmd_array and cmd_array[0]: + binary_name = Path(cmd_array[0]).name or plugin # Fallback to plugin name + binary_abspath = cmd_array[0] + binary_version = cmd_version or "" + + # Get or create Binary record + binary_id = get_or_create_binary( + cursor, + machine_id, + binary_name, + binary_abspath, + binary_version, + ) + + if i == 0: + print(f"DEBUG 0027: Created Binary: id={binary_id}, name={binary_name}") + + # Map status + process_status, exit_code = map_status(status) + + # Set timestamps + started_at = start_ts or created_at + ended_at = end_ts if process_status == "exited" else None + + # Create Process record + process_id = create_process( + cursor=cursor, + machine_id=machine_id, + pwd=pwd or "", + cmd=cmd_array, + status=process_status, + exit_code=exit_code, + 
started_at=started_at, + ended_at=ended_at, + binary_id=binary_id, + ) + + if i == 0: + print(f"DEBUG 0027: Created Process: id={process_id}") + + # Link ArchiveResult to Process + cursor.execute( + "UPDATE core_archiveresult SET process_id = ? WHERE id = ?", + [process_id, ar_id], + ) + + migrated_count += 1 + + if i == 0: + print("DEBUG 0027: Linked ArchiveResult to Process") + + except Exception as e: + print(f"✗ Error migrating ArchiveResult {ar_id}: {e}") + import traceback + + traceback.print_exc() + error_count += 1 + continue + + print(f"✓ Migration complete: {migrated_count} migrated, {skipped_count} skipped, {error_count} errors") + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0026_add_process_to_archiveresult"), + ("machine", "0007_add_process_type_and_parent"), + ] + + operations = [ + # First, copy data from old fields to Process + migrations.RunPython( + copy_archiveresult_data_to_process, + reverse_code=migrations.RunPython.noop, + ), + # Now safe to remove old fields (moved from 0025) + migrations.RemoveField( + model_name="archiveresult", + name="cmd", + ), + migrations.RemoveField( + model_name="archiveresult", + name="cmd_version", + ), + migrations.RemoveField( + model_name="archiveresult", + name="pwd", + ), + ] diff --git a/archivebox/core/migrations/0028_alter_snapshot_fs_version.py b/archivebox/core/migrations/0028_alter_snapshot_fs_version.py new file mode 100644 index 0000000000..1459f4ef84 --- /dev/null +++ b/archivebox/core/migrations/0028_alter_snapshot_fs_version.py @@ -0,0 +1,21 @@ +# Generated by Django 6.0 on 2026-01-02 08:43 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0027_copy_archiveresult_to_process"), + ] + + operations = [ + migrations.AlterField( + model_name="snapshot", + name="fs_version", + field=models.CharField( + default="0.9.0", + help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). 
Used to trigger lazy migration on save().', + max_length=10, + ), + ), + ] diff --git a/archivebox/core/migrations/0029_migrate_archiveresult_to_uuid_pk.py b/archivebox/core/migrations/0029_migrate_archiveresult_to_uuid_pk.py new file mode 100644 index 0000000000..7ed7d36ede --- /dev/null +++ b/archivebox/core/migrations/0029_migrate_archiveresult_to_uuid_pk.py @@ -0,0 +1,218 @@ +# Generated by hand on 2026-01-02 +# Migrate ArchiveResult from integer PK to UUID PK (matching Snapshot) + +from django.db import migrations, models, connection +from uuid import UUID +from archivebox.uuid_compat import uuid7 + + +def migrate_archiveresult_id_to_uuid(apps, schema_editor): + """ + Migrate ArchiveResult from integer PK to UUID PK (clean one-step migration). + + Handles both migration paths: + - 0.7.x: ArchiveResult has integer id, NO uuid field → generate new UUIDs + - 0.8.x: ArchiveResult has integer id + optional uuid field → reuse existing UUIDs + + Strategy: + 1. Create new table with UUID as primary key (no temporary columns) + 2. Generate UUIDs for records missing them (0.7.x) or reuse existing (0.8.x) + 3. Copy all data with UUID as new id + 4. Drop old table, rename new table + 5. 
Recreate indexes + + Result: Clean schema with ONLY id as UUIDField (no old_id, no uuid) + """ + cursor = connection.cursor() + + # Check if table exists and has data + cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='core_archiveresult'") + if not cursor.fetchone(): + print("ArchiveResult table does not exist, skipping migration") + return + + cursor.execute("SELECT COUNT(*) FROM core_archiveresult") + row_count = cursor.fetchone()[0] + + # Don't skip if table is empty - we still need to recreate to remove uuid column + # (fresh installs create table with uuid from 0025, but model expects no uuid after 0029) + + if row_count == 0: + print("[0029] Recreating ArchiveResult table schema (integer→UUID PK, removing uuid column)...") + else: + print(f"[0029] Migrating {row_count} ArchiveResult records from integer PK to UUID PK...") + + # Step 0: Check if machine_process table exists, if not NULL out process_id values + cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='machine_process'") + machine_process_exists = cursor.fetchone() is not None + + if not machine_process_exists: + print("machine_process table does not exist yet, setting process_id to NULL") + cursor.execute("UPDATE core_archiveresult SET process_id = NULL WHERE process_id IS NOT NULL") + + # Step 1: Create new table with UUID as primary key (clean - no old_id or uuid columns) + cursor.execute(""" + CREATE TABLE core_archiveresult_new ( + id TEXT PRIMARY KEY NOT NULL, + created_at DATETIME NOT NULL, + modified_at DATETIME NOT NULL, + + snapshot_id TEXT NOT NULL, + plugin VARCHAR(32) NOT NULL, + hook_name VARCHAR(255) NOT NULL DEFAULT '', + + status VARCHAR(15) NOT NULL DEFAULT 'queued', + retry_at DATETIME, + + start_ts DATETIME, + end_ts DATETIME, + + output_str TEXT NOT NULL DEFAULT '', + output_json TEXT, + output_files TEXT NOT NULL DEFAULT '{}', + output_size BIGINT NOT NULL DEFAULT 0, + output_mimetypes VARCHAR(512) NOT NULL DEFAULT '', + + config 
TEXT NOT NULL DEFAULT '{}', + notes TEXT NOT NULL DEFAULT '', + num_uses_succeeded INTEGER NOT NULL DEFAULT 0, + num_uses_failed INTEGER NOT NULL DEFAULT 0, + + process_id TEXT, + + FOREIGN KEY (snapshot_id) REFERENCES core_snapshot(id) ON DELETE CASCADE, + FOREIGN KEY (process_id) REFERENCES machine_process(id) ON DELETE SET NULL + ); + """) + + # Step 2: Generate UUIDs for records that don't have them + # Check if uuid column exists (0.8.x has it, 0.7.x doesn't) + cursor.execute("PRAGMA table_info(core_archiveresult)") + columns = cursor.fetchall() + col_names = [col[1] for col in columns] + has_uuid_column = "uuid" in col_names + + if has_uuid_column: + cursor.execute("SELECT id, uuid FROM core_archiveresult") + records = cursor.fetchall() + id_to_uuid = {} + for old_id, existing_uuid in records: + if existing_uuid: + # Normalize existing UUID to 32-char hex format (Django SQLite UUIDField format) + # (existing UUIDs might be stored with or without dashes in old schema) + id_to_uuid[old_id] = UUID(existing_uuid).hex + else: + # Generate new UUIDv7 (time-ordered) as 32-char hex + id_to_uuid[old_id] = uuid7().hex + else: + # 0.7.x path: no uuid column, generate new UUIDs for all records + cursor.execute("SELECT id FROM core_archiveresult") + records = cursor.fetchall() + id_to_uuid = {old_id: uuid7().hex for (old_id,) in records} + + # Step 3: Copy data with UUIDs as new primary key + cursor.execute("SELECT * FROM core_archiveresult") + old_records = cursor.fetchall() + + # col_names already fetched in Step 2 + inserted_count = 0 + for i, record in enumerate(old_records): + old_id = record[col_names.index("id")] + new_uuid = id_to_uuid[old_id] + + # Build insert with new structure + values = {col_names[i]: record[i] for i in range(len(col_names))} + + # List of fields to copy (all fields from new schema except id, old_id, uuid) + fields_to_copy = [ + "created_at", + "modified_at", + "snapshot_id", + "plugin", + "hook_name", + "status", + "retry_at", + "start_ts", 
+ "end_ts", + "output_str", + "output_json", + "output_files", + "output_size", + "output_mimetypes", + "config", + "notes", + "num_uses_succeeded", + "num_uses_failed", + "process_id", + ] + + # Build INSERT statement (only copy fields that exist in source) + existing_fields = [f for f in fields_to_copy if f in values] + + if i == 0: + print(f"[0029] Source columns: {col_names}") + print(f"[0029] Copying fields: {existing_fields}") + + placeholders = ", ".join(["?"] * (len(existing_fields) + 1)) # +1 for id + field_list = "id, " + ", ".join(existing_fields) + + insert_values = [new_uuid] + [values.get(f) for f in existing_fields] + + try: + cursor.execute( + f"INSERT INTO core_archiveresult_new ({field_list}) VALUES ({placeholders})", + insert_values, + ) + inserted_count += 1 + except Exception as e: + print(f"[0029] ERROR inserting record {old_id}: {e}") + if i == 0: + print(f"[0029] First record values: {insert_values[:5]}...") + raise + + print(f"[0029] Inserted {inserted_count}/{len(old_records)} records") + + # Step 4: Replace old table with new table + cursor.execute("DROP TABLE core_archiveresult") + cursor.execute("ALTER TABLE core_archiveresult_new RENAME TO core_archiveresult") + + # Step 5: Create indexes + cursor.execute("CREATE INDEX core_archiveresult_snapshot_id_idx ON core_archiveresult(snapshot_id)") + cursor.execute("CREATE INDEX core_archiveresult_plugin_idx ON core_archiveresult(plugin)") + cursor.execute("CREATE INDEX core_archiveresult_status_idx ON core_archiveresult(status)") + cursor.execute("CREATE INDEX core_archiveresult_retry_at_idx ON core_archiveresult(retry_at)") + cursor.execute("CREATE INDEX core_archiveresult_created_at_idx ON core_archiveresult(created_at)") + cursor.execute("CREATE INDEX core_archiveresult_hook_name_idx ON core_archiveresult(hook_name)") + cursor.execute("CREATE INDEX core_archiveresult_process_id_idx ON core_archiveresult(process_id)") + + print(f"✓ Migrated {row_count} ArchiveResult records to UUID primary 
key") + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0028_alter_snapshot_fs_version"), + ] + + operations = [ + migrations.SeparateDatabaseAndState( + database_operations=[ + migrations.RunPython( + migrate_archiveresult_id_to_uuid, + reverse_code=migrations.RunPython.noop, + ), + ], + state_operations=[ + # Remove uuid field (was added in 0025, we're merging it into id) + migrations.RemoveField( + model_name="archiveresult", + name="uuid", + ), + # Change id from AutoField to UUIDField (absorbing the uuid field) + migrations.AlterField( + model_name="archiveresult", + name="id", + field=models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True), + ), + ], + ), + ] diff --git a/archivebox/core/migrations/0030_alter_archiveresult_id.py b/archivebox/core/migrations/0030_alter_archiveresult_id.py new file mode 100644 index 0000000000..398cca9824 --- /dev/null +++ b/archivebox/core/migrations/0030_alter_archiveresult_id.py @@ -0,0 +1,19 @@ +# Generated by Django 6.0 on 2026-01-02 10:02 + +from django.db import migrations, models + +from archivebox.uuid_compat import uuid7 + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0029_migrate_archiveresult_to_uuid_pk"), + ] + + operations = [ + migrations.AlterField( + model_name="archiveresult", + name="id", + field=models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True), + ), + ] diff --git a/archivebox/core/migrations/0031_add_archiveresult_snapshot_status_index.py b/archivebox/core/migrations/0031_add_archiveresult_snapshot_status_index.py new file mode 100644 index 0000000000..4d31b51866 --- /dev/null +++ b/archivebox/core/migrations/0031_add_archiveresult_snapshot_status_index.py @@ -0,0 +1,16 @@ +# Generated by Codex on 2026-01-21 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0030_alter_archiveresult_id"), + ] + + operations = [ + 
migrations.AddIndex( + model_name="archiveresult", + index=models.Index(fields=["snapshot", "status"], name="archiveresult_snap_status_idx"), + ), + ] diff --git a/archivebox/core/migrations/0032_remove_archiveresult_retry_at.py b/archivebox/core/migrations/0032_remove_archiveresult_retry_at.py new file mode 100644 index 0000000000..7883195089 --- /dev/null +++ b/archivebox/core/migrations/0032_remove_archiveresult_retry_at.py @@ -0,0 +1,14 @@ +from django.db import migrations + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0031_add_archiveresult_snapshot_status_index"), + ] + + operations = [ + migrations.RemoveField( + model_name="archiveresult", + name="retry_at", + ), + ] diff --git a/archivebox/core/migrations/0033_alter_archiveresult_status.py b/archivebox/core/migrations/0033_alter_archiveresult_status.py new file mode 100644 index 0000000000..8f2315cd25 --- /dev/null +++ b/archivebox/core/migrations/0033_alter_archiveresult_status.py @@ -0,0 +1,28 @@ +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0032_remove_archiveresult_retry_at"), + ] + + operations = [ + migrations.AlterField( + model_name="archiveresult", + name="status", + field=models.CharField( + choices=[ + ("queued", "Queued"), + ("started", "Started"), + ("backoff", "Waiting to retry"), + ("succeeded", "Succeeded"), + ("failed", "Failed"), + ("skipped", "Skipped"), + ("noresults", "No Results"), + ], + db_index=True, + default="queued", + max_length=16, + ), + ), + ] diff --git a/tests/mock_server/__init__.py b/archivebox/core/migrations/archivebox/api/migrations/__init__.py similarity index 100% rename from tests/mock_server/__init__.py rename to archivebox/core/migrations/archivebox/api/migrations/__init__.py diff --git a/archivebox/core/migrations/archivebox/crawls/migrations/__init__.py b/archivebox/core/migrations/archivebox/crawls/migrations/__init__.py new file mode 100644 index 
0000000000..e69de29bb2 diff --git a/archivebox/core/migrations/archivebox/machine/migrations/__init__.py b/archivebox/core/migrations/archivebox/machine/migrations/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/archivebox/core/mixins.py b/archivebox/core/mixins.py deleted file mode 100644 index 4711dd0e07..0000000000 --- a/archivebox/core/mixins.py +++ /dev/null @@ -1,21 +0,0 @@ -from django.contrib import messages - -from archivebox.search import query_search_index - -class SearchResultsAdminMixin: - def get_search_results(self, request, queryset, search_term: str): - """Enhances the search queryset with results from the search backend""" - - qs, use_distinct = super().get_search_results(request, queryset, search_term) - - search_term = search_term.strip() - if not search_term: - return qs, use_distinct - try: - qsearch = query_search_index(search_term) - qs = qs | qsearch - except Exception as err: - print(f'[!] Error while using search backend: {err.__class__.__name__} {err}') - messages.add_message(request, messages.WARNING, f'Error from the search backend, only showing results from default admin search fields - Error: {err}') - - return qs, use_distinct diff --git a/archivebox/core/models.py b/archivebox/core/models.py old mode 100644 new mode 100755 index 0c9733d066..4256584ff3 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -1,286 +1,3367 @@ -__package__ = 'archivebox.core' - +__package__ = "archivebox.core" +from typing import Optional, Any, cast +from collections.abc import Iterable, Sequence import uuid -import json +from archivebox.uuid_compat import uuid7 +from datetime import datetime, timedelta +import os +import json from pathlib import Path -from typing import Optional, List + +from statemachine import State, registry from django.db import models +from django.db.models import QuerySet from django.utils.functional import cached_property from django.utils.text import slugify +from django.utils import 
timezone from django.core.cache import cache -from django.urls import reverse -from django.db.models import Case, When, Value, IntegerField -from django.contrib.auth.models import User # noqa - -from ..config import ARCHIVE_DIR, ARCHIVE_DIR_NAME -from ..system import get_dir_size -from ..util import parse_date, base_url, hashurl -from ..index.schema import Link -from ..index.html import snapshot_icons -from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE - -EXTRACTORS = [(extractor[0], extractor[0]) for extractor in get_default_archive_methods()] -STATUS_CHOICES = [ - ("succeeded", "succeeded"), - ("failed", "failed"), - ("skipped", "skipped") -] - -try: - JSONField = models.JSONField -except AttributeError: - import jsonfield - JSONField = jsonfield.JSONField - - -class Tag(models.Model): - """ - Based on django-taggit model - """ - id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID') +from django.urls import reverse_lazy +from django.contrib import admin +from django.conf import settings +from django.utils.safestring import mark_safe + +from archivebox.config import CONSTANTS +from archivebox.misc.system import get_dir_size, atomic_write +from archivebox.misc.util import parse_date, domain as url_domain, to_json, ts_to_date_str, urlencode, htmlencode, urldecode +from archivebox.hooks import ( + get_plugins, + get_plugin_name, + get_plugin_icon, +) +from archivebox.base_models.models import ( + ModelWithUUID, + ModelWithOutputDir, + ModelWithConfig, + ModelWithNotes, + ModelWithHealthStats, + get_or_create_system_user_pk, +) +from archivebox.workers.models import ModelWithStateMachine, BaseStateMachine +from archivebox.workers.tasks import bg_archive_snapshot +from archivebox.crawls.models import Crawl +from archivebox.machine.models import Binary - name = models.CharField(unique=True, blank=False, max_length=100) - # slug is autoset on save from name, never set it manually - slug = 
models.SlugField(unique=True, blank=True, max_length=100) +class Tag(ModelWithUUID): + # Keep AutoField for compatibility with main branch migrations + # Don't use UUIDField here - requires complex FK transformation + id = models.AutoField(primary_key=True, serialize=False, verbose_name="ID") + created_by = models.ForeignKey( + settings.AUTH_USER_MODEL, + on_delete=models.CASCADE, + default=get_or_create_system_user_pk, + null=True, + related_name="tag_set", + ) + created_at = models.DateTimeField(default=timezone.now, db_index=True, null=True) + modified_at = models.DateTimeField(auto_now=True) + name = models.CharField(unique=True, blank=False, max_length=100) + slug = models.SlugField(unique=True, blank=False, max_length=100, editable=False) + snapshot_set: models.Manager["Snapshot"] - class Meta: + class Meta(ModelWithUUID.Meta): + app_label = "core" verbose_name = "Tag" verbose_name_plural = "Tags" def __str__(self): return self.name - def slugify(self, tag, i=None): - slug = slugify(tag) - if i is not None: - slug += "_%d" % i + def _generate_unique_slug(self) -> str: + base_slug = slugify(self.name) or "tag" + existing = Tag.objects.filter(slug__startswith=base_slug) + if self.pk: + existing = existing.exclude(pk=self.pk) + existing_slugs = set(existing.values_list("slug", flat=True)) + + slug = base_slug + i = 1 + while slug in existing_slugs: + slug = f"{base_slug}_{i}" + i += 1 return slug def save(self, *args, **kwargs): - if self._state.adding and not self.slug: - self.slug = self.slugify(self.name) - - # if name is different but slug conficts with another tags slug, append a counter - # with transaction.atomic(): - slugs = set( - type(self) - ._default_manager.filter(slug__startswith=self.slug) - .values_list("slug", flat=True) - ) + existing_name = None + if self.pk: + existing_name = Tag.objects.filter(pk=self.pk).values_list("name", flat=True).first() + + if not self.slug or existing_name != self.name: + self.slug = self._generate_unique_slug() + 
super().save(*args, **kwargs) + + # if is_new: + # from archivebox.misc.logging_util import log_worker_event + # log_worker_event( + # worker_type='DB', + # event='Created Tag', + # indent_level=0, + # metadata={ + # 'id': self.id, + # 'name': self.name, + # 'slug': self.slug, + # }, + # ) + + @property + def api_url(self) -> str: + return str(reverse_lazy("api-1:get_tag", args=[self.id])) + + def to_json(self) -> dict: + """ + Convert Tag model instance to a JSON-serializable dict. + """ + from archivebox.config import VERSION + + return { + "type": "Tag", + "schema_version": VERSION, + "id": str(self.id), + "name": self.name, + "slug": self.slug, + } + + @staticmethod + def from_json(record: dict[str, Any], overrides: dict[str, Any] | None = None): + """ + Create/update Tag from JSON dict. + + Args: + record: JSON dict with 'name' field + overrides: Optional dict with 'snapshot' to auto-attach tag + + Returns: + Tag instance or None + """ + name = record.get("name") + if not name: + return None + + tag, _ = Tag.objects.get_or_create(name=name) + + # Auto-attach to snapshot if in overrides + if overrides and "snapshot" in overrides and tag: + overrides["snapshot"].tags.add(tag) + + return tag + + +class SnapshotTag(models.Model): + id = models.AutoField(primary_key=True) + snapshot = models.ForeignKey("Snapshot", db_column="snapshot_id", on_delete=models.CASCADE, to_field="id") + tag = models.ForeignKey(Tag, db_column="tag_id", on_delete=models.CASCADE, to_field="id") + + class Meta: + app_label = "core" + db_table = "core_snapshot_tags" + unique_together = [("snapshot", "tag")] + + +class SnapshotQuerySet(models.QuerySet): + """Custom QuerySet for Snapshot model with export methods that persist through .filter() etc.""" + + # ========================================================================= + # Filtering Methods + # ========================================================================= - i = None - while True: - slug = self.slugify(self.name, i) - if 
slug not in slugs: - self.slug = slug - return super().save(*args, **kwargs) - i = 1 if i is None else i+1 + FILTER_TYPES = { + "exact": lambda pattern: models.Q(url=pattern), + "substring": lambda pattern: models.Q(url__icontains=pattern), + "regex": lambda pattern: models.Q(url__iregex=pattern), + "domain": lambda pattern: ( + models.Q(url__istartswith=f"http://{pattern}") + | models.Q(url__istartswith=f"https://{pattern}") + | models.Q(url__istartswith=f"ftp://{pattern}") + ), + "tag": lambda pattern: models.Q(tags__name=pattern), + "timestamp": lambda pattern: models.Q(timestamp=pattern), + } + + def filter_by_patterns(self, patterns: list[str], filter_type: str = "exact") -> "SnapshotQuerySet": + """Filter snapshots by URL patterns using specified filter type""" + from archivebox.misc.logging import stderr + + q_filter = models.Q() + for pattern in patterns: + try: + q_filter = q_filter | self.FILTER_TYPES[filter_type](pattern) + except KeyError: + stderr() + stderr(f"[X] Got invalid pattern for --filter-type={filter_type}:", color="red") + stderr(f" {pattern}") + raise SystemExit(2) + return self.filter(q_filter) + + def search(self, patterns: list[str]) -> "SnapshotQuerySet": + """Search snapshots using the configured search backend""" + from archivebox.config.common import SEARCH_BACKEND_CONFIG + from archivebox.search import query_search_index + from archivebox.misc.logging import stderr + + if not SEARCH_BACKEND_CONFIG.USE_SEARCHING_BACKEND: + stderr() + stderr("[X] The search backend is not enabled, set config.USE_SEARCHING_BACKEND = True", color="red") + raise SystemExit(2) + + qsearch = self.none() + for pattern in patterns: + try: + qsearch |= query_search_index(pattern) + except BaseException: + raise SystemExit(2) + return self.all() & qsearch + + # ========================================================================= + # Export Methods + # ========================================================================= + + def to_json(self, 
with_headers: bool = False) -> str: + """Generate JSON index from snapshots""" + import sys + from datetime import datetime, timezone as tz + from archivebox.config import VERSION + from archivebox.config.common import SERVER_CONFIG + + MAIN_INDEX_HEADER = ( + { + "info": "This is an index of site data archived by ArchiveBox: The self-hosted web archive.", + "schema": "archivebox.index.json", + "copyright_info": SERVER_CONFIG.FOOTER_INFO, + "meta": { + "project": "ArchiveBox", + "version": VERSION, + "git_sha": VERSION, + "website": "https://ArchiveBox.io", + "docs": "https://github.com/ArchiveBox/ArchiveBox/wiki", + "source": "https://github.com/ArchiveBox/ArchiveBox", + "issues": "https://github.com/ArchiveBox/ArchiveBox/issues", + "dependencies": {}, + }, + } + if with_headers + else {} + ) + + snapshot_dicts = [s.to_dict(extended=True) for s in self.iterator(chunk_size=500)] + + if with_headers: + output = { + **MAIN_INDEX_HEADER, + "num_links": len(snapshot_dicts), + "updated": datetime.now(tz.utc), + "last_run_cmd": sys.argv, + "links": snapshot_dicts, + } else: - return super().save(*args, **kwargs) + output = snapshot_dicts + return to_json(output, indent=4, sort_keys=True) + + def to_csv(self, cols: list[str] | None = None, header: bool = True, separator: str = ",", ljust: int = 0) -> str: + """Generate CSV output from snapshots""" + cols = cols or ["timestamp", "is_archived", "url"] + header_str = separator.join(col.ljust(ljust) for col in cols) if header else "" + row_strs = (s.to_csv(cols=cols, ljust=ljust, separator=separator) for s in self.iterator(chunk_size=500)) + return "\n".join((header_str, *row_strs)) + + def to_html(self, with_headers: bool = True) -> str: + """Generate main index HTML from snapshots""" + from datetime import datetime, timezone as tz + from django.template.loader import render_to_string + from archivebox.config import VERSION + from archivebox.config.common import SERVER_CONFIG + from archivebox.config.version import 
get_COMMIT_HASH + + template = "static_index.html" if with_headers else "minimal_index.html" + snapshot_list = list(self.iterator(chunk_size=500)) + + return render_to_string( + template, + { + "version": VERSION, + "git_sha": get_COMMIT_HASH() or VERSION, + "num_links": str(len(snapshot_list)), + "date_updated": datetime.now(tz.utc).strftime("%Y-%m-%d"), + "time_updated": datetime.now(tz.utc).strftime("%Y-%m-%d %H:%M"), + "links": snapshot_list, + "FOOTER_INFO": SERVER_CONFIG.FOOTER_INFO, + }, + ) -class Snapshot(models.Model): - id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) +class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)): # ty: ignore[unsupported-base] + """Manager for Snapshot model - uses SnapshotQuerySet for chainable methods""" - url = models.URLField(unique=True, db_index=True) - timestamp = models.CharField(max_length=32, unique=True, db_index=True) + def filter(self, *args, **kwargs): + domain = kwargs.pop("domain", None) + qs = super().filter(*args, **kwargs) + if domain: + qs = qs.filter(url__icontains=f"://{domain}") + return qs + + def get_queryset(self): + # Don't prefetch by default - it causes "too many open files" during bulk operations + # Views/templates can add .prefetch_related('tags', 'archiveresult_set') where needed + return super().get_queryset() + + # ========================================================================= + # Import Methods + # ========================================================================= + + def remove(self, atomic: bool = False) -> tuple: + """Remove snapshots from the database""" + from django.db import transaction + + if atomic: + with transaction.atomic(): + return self.get_queryset().delete() + return self.get_queryset().delete() + + +class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine): + id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True) + created_at = 
models.DateTimeField(default=timezone.now, db_index=True) + modified_at = models.DateTimeField(auto_now=True) + + url = models.URLField(unique=False, db_index=True) # URLs can appear in multiple crawls + timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False) + bookmarked_at = models.DateTimeField(default=timezone.now, db_index=True) + crawl: Crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, null=False, related_name="snapshot_set", db_index=True) # type: ignore[assignment] + parent_snapshot = models.ForeignKey( + "self", + on_delete=models.SET_NULL, + null=True, + blank=True, + related_name="child_snapshots", + db_index=True, + help_text="Parent snapshot that discovered this URL (for recursive crawling)", + ) title = models.CharField(max_length=512, null=True, blank=True, db_index=True) + downloaded_at = models.DateTimeField(default=None, null=True, editable=False, db_index=True, blank=True) + depth = models.PositiveSmallIntegerField(default=0, db_index=True) # 0 for root snapshot, 1+ for discovered URLs + fs_version = models.CharField( + max_length=10, + default="0.9.0", + help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().', + ) + current_step = models.PositiveSmallIntegerField( + default=0, + db_index=True, + help_text="Current hook step being executed (0-9). 
Used for sequential hook execution.", + ) + + retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now) + status = ModelWithStateMachine.StatusField( + choices=ModelWithStateMachine.StatusChoices, + default=ModelWithStateMachine.StatusChoices.QUEUED, + ) + config = models.JSONField(default=dict, null=False, blank=False, editable=True) + notes = models.TextField(blank=True, null=False, default="") + # output_dir is computed via @cached_property from fs_version and get_storage_path_for_version() + + tags = models.ManyToManyField(Tag, blank=True, through=SnapshotTag, related_name="snapshot_set", through_fields=("snapshot", "tag")) + + state_machine_name = "archivebox.core.models.SnapshotMachine" + state_field_name = "status" + retry_at_field_name = "retry_at" + StatusChoices = ModelWithStateMachine.StatusChoices + active_state = StatusChoices.STARTED + + crawl_id: uuid.UUID + parent_snapshot_id: uuid.UUID | None + _prefetched_objects_cache: dict[str, Any] + + objects = SnapshotManager() + archiveresult_set: models.Manager["ArchiveResult"] + + class Meta( + ModelWithOutputDir.Meta, + ModelWithConfig.Meta, + ModelWithNotes.Meta, + ModelWithHealthStats.Meta, + ModelWithStateMachine.Meta, + ): + app_label = "core" + verbose_name = "Snapshot" + verbose_name_plural = "Snapshots" + constraints = [ + # Allow same URL in different crawls, but not duplicates within same crawl + models.UniqueConstraint(fields=["url", "crawl"], name="unique_url_per_crawl"), + # Global timestamp uniqueness for 1:1 symlink mapping + models.UniqueConstraint(fields=["timestamp"], name="unique_timestamp"), + ] + + def __str__(self): + return f"[{self.id}] {self.url[:64]}" - added = models.DateTimeField(auto_now_add=True, db_index=True) - updated = models.DateTimeField(auto_now=True, blank=True, null=True, db_index=True) - tags = models.ManyToManyField(Tag, blank=True) + @property + def created_by(self): + """Convenience property to access the user who created this snapshot via its crawl.""" 
+ return self.crawl.created_by - keys = ('url', 'timestamp', 'title', 'tags', 'updated') + @property + def process_set(self): + """Get all Process objects related to this snapshot's ArchiveResults.""" + from archivebox.machine.models import Process - def __repr__(self) -> str: - title = self.title or '-' - return f'[{self.timestamp}] {self.url[:64]} ({title[:64]})' + return Process.objects.filter(archiveresult__snapshot_id=self.id) - def __str__(self) -> str: - title = self.title or '-' - return f'[{self.timestamp}] {self.url[:64]} ({title[:64]})' + @property + def binary_set(self): + """Get all Binary objects used by processes related to this snapshot.""" + from archivebox.machine.models import Binary + + return Binary.objects.filter(process_set__archiveresult__snapshot_id=self.id).distinct() + + def save(self, *args, **kwargs): + if not self.bookmarked_at: + self.bookmarked_at = self.created_at or timezone.now() + if not self.timestamp: + self.timestamp = str(self.bookmarked_at.timestamp()) + + # Migrate filesystem if needed (happens automatically on save) + if self.pk and self.fs_migration_needed: + print( + f"[DEBUG save()] Triggering filesystem migration for {str(self.id)[:8]}: {self.fs_version} → {self._fs_current_version()}", + ) + # Walk through migration chain automatically + current = self.fs_version + target = self._fs_current_version() + + while current != target: + next_ver = self._fs_next_version(current) + method = f"_fs_migrate_from_{current.replace('.', '_')}_to_{next_ver.replace('.', '_')}" + + # Only run if method exists (most are no-ops) + if hasattr(self, method): + print(f"[DEBUG save()] Running {method}()") + getattr(self, method)() + + current = next_ver + + # Update version + self.fs_version = target + + super().save(*args, **kwargs) + self.ensure_legacy_archive_symlink() + existing_urls = {url for _raw_line, url in self.crawl._iter_url_lines() if url} + if self.crawl.url_passes_filters(self.url, snapshot=self) and self.url not in 
existing_urls: + self.crawl.urls += f"\n{self.url}" + self.crawl.save() + + # if is_new: + # from archivebox.misc.logging_util import log_worker_event + # log_worker_event( + # worker_type='DB', + # event='Created Snapshot', + # indent_level=2, + # url=self.url, + # metadata={ + # 'id': str(self.id), + # 'crawl_id': str(self.crawl_id), + # 'depth': self.depth, + # 'status': self.status, + # }, + # ) + + # ========================================================================= + # Filesystem Migration Methods + # ========================================================================= + + @staticmethod + def _fs_current_version() -> str: + """Get current ArchiveBox filesystem version (normalized to x.x.0 format)""" + from archivebox.config import VERSION + + # Normalize version to x.x.0 format (e.g., "0.9.0rc1" -> "0.9.0") + parts = VERSION.split(".") + if len(parts) >= 2: + major, minor = parts[0], parts[1] + # Strip any non-numeric suffix from minor version + minor = "".join(c for c in minor if c.isdigit()) + return f"{major}.{minor}.0" + return "0.9.0" # Fallback if version parsing fails + + @property + def fs_migration_needed(self) -> bool: + """Check if snapshot needs filesystem migration""" + return self.fs_version != self._fs_current_version() + + def _fs_next_version(self, version: str) -> str: + """Get next version in migration chain (0.7/0.8 had same layout, only 0.8→0.9 migration needed)""" + # Treat 0.7.0 and 0.8.0 as equivalent (both used archive/{timestamp}) + if version in ("0.7.0", "0.8.0"): + return "0.9.0" + return self._fs_current_version() + + def _fs_migrate_from_0_8_0_to_0_9_0(self): + """ + Migrate from flat to nested structure. + + 0.8.x: archive/{timestamp}/ + 0.9.x: users/{user}/snapshots/YYYYMMDD/{domain}/{uuid}/ + + Transaction handling: + 1. Copy files INSIDE transaction + 2. Convert index.json to index.jsonl INSIDE transaction + 3. Create symlink INSIDE transaction + 4. Update fs_version INSIDE transaction (done by save()) + 5. 
Exit transaction (DB commit) + 6. Delete old files OUTSIDE transaction (after commit) + """ + import shutil + from django.db import transaction + + old_dir = self.get_storage_path_for_version("0.8.0") + new_dir = self.get_storage_path_for_version("0.9.0") + + print( + f"[DEBUG _fs_migrate] {self.timestamp}: old_exists={old_dir.exists()}, same={old_dir == new_dir}, new_exists={new_dir.exists()}", + ) + + if not old_dir.exists() or old_dir == new_dir: + # No migration needed + print("[DEBUG _fs_migrate] Returning None (early return)") + return None + + if new_dir.exists(): + # New directory already exists (files already copied), but we still need cleanup + # Return cleanup info so old directory can be cleaned up + print("[DEBUG _fs_migrate] Returning cleanup info (new_dir exists)") + return (old_dir, new_dir) + + new_dir.mkdir(parents=True, exist_ok=True) + + # Copy all files (idempotent), skipping index.json (will be converted to jsonl) + for old_file in old_dir.rglob("*"): + if not old_file.is_file(): + continue + + rel_path = old_file.relative_to(old_dir) + new_file = new_dir / rel_path + + # Skip if already copied + if new_file.exists() and new_file.stat().st_size == old_file.stat().st_size: + continue + + new_file.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(old_file, new_file) + + # Verify all copied + old_files = {f.relative_to(old_dir): f.stat().st_size for f in old_dir.rglob("*") if f.is_file()} + new_files = {f.relative_to(new_dir): f.stat().st_size for f in new_dir.rglob("*") if f.is_file()} + + if old_files.keys() != new_files.keys(): + missing = old_files.keys() - new_files.keys() + raise Exception(f"Migration incomplete: missing {missing}") + + # Convert index.json to index.jsonl in the new directory + self.convert_index_json_to_jsonl() + + # Schedule cleanup AFTER transaction commits successfully + # This ensures DB changes are committed before we delete old files + transaction.on_commit(lambda: self._cleanup_old_migration_dir(old_dir, 
new_dir)) + + # Return cleanup info for manual cleanup if needed (when called directly) + return (old_dir, new_dir) + + def _cleanup_old_migration_dir(self, old_dir: Path, new_dir: Path): + """ + Delete old directory and create symlink after successful migration. + """ + import shutil + import logging + + # Delete old directory + if old_dir.exists() and not old_dir.is_symlink(): + try: + shutil.rmtree(old_dir) + except Exception as e: + logging.getLogger("archivebox.migration").warning( + f"Could not remove old migration directory {old_dir}: {e}", + ) + return # Don't create symlink if cleanup failed + + # Create backwards-compat symlink (after old dir is deleted) + symlink_path = old_dir # Same path as old_dir + if symlink_path.is_symlink(): + symlink_path.unlink() + + if not symlink_path.exists(): + try: + symlink_path.symlink_to(new_dir, target_is_directory=True) + except Exception as e: + logging.getLogger("archivebox.migration").warning( + f"Could not create symlink from {symlink_path} to {new_dir}: {e}", + ) + + # ========================================================================= + # Path Calculation and Migration Helpers + # ========================================================================= + + @staticmethod + def extract_domain_from_url(url: str) -> str: + """ + Extract domain from URL for 0.9.x path structure. + Uses full hostname with sanitized special chars. 
+ + Examples: + https://example.com:8080 → example.com_8080 + https://sub.example.com → sub.example.com + file:///path → localhost + data:text/html → data + """ + from urllib.parse import urlparse + + try: + parsed = urlparse(url) + + if parsed.scheme in ("http", "https"): + if parsed.port: + return f"{parsed.hostname}_{parsed.port}".replace(":", "_") + return parsed.hostname or "unknown" + elif parsed.scheme == "file": + return "localhost" + elif parsed.scheme: + return parsed.scheme + else: + return "unknown" + except Exception: + return "unknown" + + def get_storage_path_for_version(self, version: str) -> Path: + """ + Calculate storage path for specific filesystem version. + Centralizes path logic so it's reusable. + + 0.7.x/0.8.x: archive/{timestamp} + 0.9.x: users/{username}/snapshots/YYYYMMDD/{domain}/{uuid}/ + """ + from datetime import datetime + + if version in ("0.7.0", "0.8.0"): + return CONSTANTS.ARCHIVE_DIR / self.timestamp + + elif version in ("0.9.0", "1.0.0"): + username = self.created_by.username + + # Use created_at for date grouping (fallback to timestamp) + if self.created_at: + date_str = self.created_at.strftime("%Y%m%d") + else: + date_str = datetime.fromtimestamp(float(self.timestamp)).strftime("%Y%m%d") + + domain = self.extract_domain_from_url(self.url) + + return CONSTANTS.DATA_DIR / "users" / username / "snapshots" / date_str / domain / str(self.id) + else: + # Unknown version - use current + return self.get_storage_path_for_version(self._fs_current_version()) + + # ========================================================================= + # Loading and Creation from Filesystem (Used by archivebox update ONLY) + # ========================================================================= @classmethod - def from_json(cls, info: dict): - info = {k: v for k, v in info.items() if k in cls.keys} - return cls(**info) + def load_from_directory(cls, snapshot_dir: Path) -> Optional["Snapshot"]: + """ + Load existing Snapshot from DB by reading 
index.jsonl or index.json. - def as_json(self, *args) -> dict: - args = args or self.keys - return { - key: getattr(self, key) - if key != 'tags' else self.tags_str() - for key in args - } + Reads index file, extracts url+timestamp, queries DB. + Returns existing Snapshot or None if not found/invalid. + Does NOT create new snapshots. - def as_link(self) -> Link: - return Link.from_json(self.as_json()) + ONLY used by: archivebox update (for orphan detection) + """ + from archivebox.machine.models import Process - def as_link_with_details(self) -> Link: - from ..index import load_link_details - return load_link_details(self.as_link()) + # Try index.jsonl first (new format), then index.json (legacy) + jsonl_path = snapshot_dir / CONSTANTS.JSONL_INDEX_FILENAME + json_path = snapshot_dir / CONSTANTS.JSON_INDEX_FILENAME - def tags_str(self, nocache=True) -> str: - cache_key = f'{self.id}-{(self.updated or self.added).timestamp()}-tags' - calc_tags_str = lambda: ','.join(self.tags.order_by('name').values_list('name', flat=True)) - if nocache: - tags_str = calc_tags_str() - cache.set(cache_key, tags_str) - return tags_str - return cache.get_or_set(cache_key, calc_tags_str) + data = None + if jsonl_path.exists(): + try: + records = Process.parse_records_from_text(jsonl_path.read_text()) + for record in records: + if record.get("type") == "Snapshot": + data = record + break + except OSError: + pass + elif json_path.exists(): + try: + with open(json_path) as f: + data = json.load(f) + except (json.JSONDecodeError, OSError): + pass - def icons(self) -> str: - return snapshot_icons(self) + if not data: + return None - @cached_property - def extension(self) -> str: - from ..util import extension - return extension(self.url) + url = data.get("url") + if not url: + return None - @cached_property - def bookmarked(self): - return parse_date(self.timestamp) + # Get timestamp - prefer index file, fallback to folder name + timestamp = cls._select_best_timestamp( + 
index_timestamp=data.get("timestamp"), + folder_name=snapshot_dir.name, + ) - @cached_property - def bookmarked_date(self): - # TODO: remove this - return self.bookmarked + if not timestamp: + return None - @cached_property - def is_archived(self): - return self.as_link().is_archived + # Look up existing (try exact match first, then fuzzy match for truncated timestamps) + try: + snapshot = cls.objects.get(url=url, timestamp=timestamp) + print(f"[DEBUG load_from_directory] Found existing snapshot for {url} @ {timestamp}: {str(snapshot.id)[:8]}") + return snapshot + except cls.DoesNotExist: + print(f"[DEBUG load_from_directory] NOT FOUND (exact): {url} @ {timestamp}") + # Try fuzzy match - index.json may have truncated timestamp + # e.g., index has "1767000340" but DB has "1767000340.624737" + candidates = cls.objects.filter(url=url, timestamp__startswith=timestamp) + if candidates.count() == 1: + snapshot = candidates.first() + if snapshot is None: + return None + print(f"[DEBUG load_from_directory] Found via fuzzy match: {snapshot.timestamp}") + return snapshot + elif candidates.count() > 1: + print("[DEBUG load_from_directory] Multiple fuzzy matches, using first") + return candidates.first() + print(f"[DEBUG load_from_directory] NOT FOUND (fuzzy): {url} @ {timestamp}") + return None + except cls.MultipleObjectsReturned: + # Should not happen with unique constraint + print(f"[DEBUG load_from_directory] Multiple snapshots found for {url} @ {timestamp}") + return cls.objects.filter(url=url, timestamp=timestamp).first() - @cached_property - def num_outputs(self): - return self.archiveresult_set.filter(status='succeeded').count() + @classmethod + def create_from_directory(cls, snapshot_dir: Path) -> Optional["Snapshot"]: + """ + Create new Snapshot from orphaned directory. - @cached_property - def url_hash(self): - return hashurl(self.url) + Validates timestamp, ensures uniqueness. + Returns new UNSAVED Snapshot or None if invalid. 
- @cached_property - def base_url(self): - return base_url(self.url) + ONLY used by: archivebox update (for orphan import) + """ + from archivebox.machine.models import Process - @cached_property - def link_dir(self): - return str(ARCHIVE_DIR / self.timestamp) + # Try index.jsonl first (new format), then index.json (legacy) + jsonl_path = snapshot_dir / CONSTANTS.JSONL_INDEX_FILENAME + json_path = snapshot_dir / CONSTANTS.JSON_INDEX_FILENAME - @cached_property - def archive_path(self): - return '{}/{}'.format(ARCHIVE_DIR_NAME, self.timestamp) + data = None + if jsonl_path.exists(): + try: + records = Process.parse_records_from_text(jsonl_path.read_text()) + for record in records: + if record.get("type") == "Snapshot": + data = record + break + except OSError: + pass + elif json_path.exists(): + try: + with open(json_path) as f: + data = json.load(f) + except (json.JSONDecodeError, OSError): + pass - @cached_property - def archive_size(self): - cache_key = f'{str(self.id)[:12]}-{(self.updated or self.added).timestamp()}-size' + if not data: + return None + + url = data.get("url") + if not url: + return None + + # Get and validate timestamp + timestamp = cls._select_best_timestamp( + index_timestamp=data.get("timestamp"), + folder_name=snapshot_dir.name, + ) + + if not timestamp: + return None + + # Ensure uniqueness (reuses existing logic from create_or_update_from_dict) + timestamp = cls._ensure_unique_timestamp(url, timestamp) - def calc_dir_size(): + # Detect version + fs_version = cls._detect_fs_version_from_index(data) + + # Get or create catchall crawl for orphaned snapshots + from archivebox.crawls.models import Crawl + + system_user_id = get_or_create_system_user_pk() + catchall_crawl, _ = Crawl.objects.get_or_create( + label="[migration] orphaned snapshots", + defaults={ + "urls": f"# Orphaned snapshot: {url}", + "max_depth": 0, + "created_by_id": system_user_id, + }, + ) + + return cls( + url=url, + timestamp=timestamp, + title=data.get("title", ""), + 
fs_version=fs_version, + crawl=catchall_crawl, + ) + + @staticmethod + def _select_best_timestamp(index_timestamp: object | None, folder_name: str) -> str | None: + """ + Select best timestamp from index.json vs folder name. + + Validates range (1995-2035). + Prefers index.json if valid. + """ + + def is_valid_timestamp(ts: object | None) -> bool: + if not isinstance(ts, (str, int, float)): + return False try: - return get_dir_size(self.link_dir)[0] - except Exception: - return 0 + ts_int = int(float(ts)) + # 1995-01-01 to 2035-12-31 + return 788918400 <= ts_int <= 2082758400 + except (TypeError, ValueError, OverflowError): + return False - return cache.get_or_set(cache_key, calc_dir_size) + index_valid = is_valid_timestamp(index_timestamp) if index_timestamp else False + folder_valid = is_valid_timestamp(folder_name) - @cached_property - def thumbnail_url(self) -> Optional[str]: - result = self.archiveresult_set.filter( - extractor='screenshot', - status='succeeded' - ).only('output').last() - if result: - return reverse('Snapshot', args=[f'{str(self.timestamp)}/{result.output}']) + if index_valid and index_timestamp is not None: + return str(int(float(str(index_timestamp)))) + if folder_valid: + return str(int(float(str(folder_name)))) return None - @cached_property - def headers(self) -> Optional[dict]: + @classmethod + def _ensure_unique_timestamp(cls, url: str, timestamp: str) -> str: + """ + Ensure timestamp is globally unique. + If collision with different URL, increment by 1 until unique. + + NOTE: Logic already exists in create_or_update_from_dict (line 266-267) + This is just an extracted, reusable version. + """ + while cls.objects.filter(timestamp=timestamp).exclude(url=url).exists(): + timestamp = str(int(float(timestamp)) + 1) + return timestamp + + @staticmethod + def _detect_fs_version_from_index(data: dict) -> str: + """ + Detect fs_version from index.json structure. 
+ + - Has fs_version field: use it + - Has history dict: 0.7.0 + - Has archive_results list: 0.8.0 + - Default: 0.7.0 + """ + if "fs_version" in data: + return data["fs_version"] + if "history" in data and "archive_results" not in data: + return "0.7.0" + if "archive_results" in data: + return "0.8.0" + return "0.7.0" + + # ========================================================================= + # Index.json Reconciliation + # ========================================================================= + + def reconcile_with_index(self): + """ + Merge index.json/index.jsonl with DB. DB is source of truth. + + - Title: longest non-URL + - Tags: union + - ArchiveResults: keep both (by plugin+start_ts) + + Converts index.json to index.jsonl if needed, then writes back in JSONL format. + + Used by: archivebox update (to sync index with DB) + """ + import json + + # Try to convert index.json to index.jsonl first + self.convert_index_json_to_jsonl() + + # Check for index.jsonl (preferred) or index.json (legacy) + jsonl_path = Path(self.output_dir) / CONSTANTS.JSONL_INDEX_FILENAME + json_path = Path(self.output_dir) / CONSTANTS.JSON_INDEX_FILENAME + + index_data = {} + + if jsonl_path.exists(): + # Read from JSONL format + jsonl_data = self.read_index_jsonl() + if jsonl_data["snapshot"]: + index_data = jsonl_data["snapshot"] + # Convert archive_results list to expected format + index_data["archive_results"] = jsonl_data["archive_results"] + elif json_path.exists(): + # Fallback to legacy JSON format + try: + with open(json_path) as f: + index_data = json.load(f) + except (OSError, TypeError, ValueError, json.JSONDecodeError): + pass + + # Merge title + self._merge_title_from_index(index_data) + + # Merge tags + self._merge_tags_from_index(index_data) + + # Merge ArchiveResults + self._merge_archive_results_from_index(index_data) + + # Write back in JSONL format + self.write_index_jsonl() + + def reconcile_with_index_json(self): + """Deprecated: use reconcile_with_index() 
instead.""" + return self.reconcile_with_index() + + def _merge_title_from_index(self, index_data: dict): + """Merge title - prefer longest non-URL title.""" + index_title = (index_data.get("title") or "").strip() + db_title = self.title or "" + + candidates = [t for t in [index_title, db_title] if t and t != self.url] + if candidates: + best_title = max(candidates, key=len) + if self.title != best_title: + self.title = best_title + + def _merge_tags_from_index(self, index_data: dict): + """Merge tags - union of both sources.""" + from django.db import transaction + + index_tags = set(index_data.get("tags", "").split(",")) if index_data.get("tags") else set() + index_tags = {t.strip() for t in index_tags if t.strip()} + + db_tags = set(self.tags.values_list("name", flat=True)) + + new_tags = index_tags - db_tags + if new_tags: + with transaction.atomic(): + for tag_name in new_tags: + tag, _ = Tag.objects.get_or_create(name=tag_name) + self.tags.add(tag) + + def _merge_archive_results_from_index(self, index_data: dict): + """Merge ArchiveResults - keep both (by plugin+start_ts).""" + existing = {(ar.plugin, ar.start_ts): ar for ar in ArchiveResult.objects.filter(snapshot=self)} + + # Handle 0.8.x format (archive_results list) + for result_data in index_data.get("archive_results", []): + self._create_archive_result_if_missing(result_data, existing) + + # Handle 0.7.x format (history dict) + if "history" in index_data and isinstance(index_data["history"], dict): + for plugin, result_list in index_data["history"].items(): + if isinstance(result_list, list): + for result_data in result_list: + # Support both old 'extractor' and new 'plugin' keys for backwards compat + result_data["plugin"] = result_data.get("plugin") or result_data.get("extractor") or plugin + self._create_archive_result_if_missing(result_data, existing) + + def _create_archive_result_if_missing(self, result_data: dict, existing: dict): + """Create ArchiveResult if not already in DB.""" + from dateutil 
import parser + + # Support both old 'extractor' and new 'plugin' keys for backwards compat + plugin = result_data.get("plugin") or result_data.get("extractor", "") + if not plugin: + return + + start_ts = None + if result_data.get("start_ts"): + try: + start_ts = parser.parse(result_data["start_ts"]) + except (TypeError, ValueError, OverflowError): + pass + + if (plugin, start_ts) in existing: + return + try: - return json.loads((Path(self.link_dir) / 'headers.json').read_text(encoding='utf-8').strip()) + end_ts = None + if result_data.get("end_ts"): + try: + end_ts = parser.parse(result_data["end_ts"]) + except (TypeError, ValueError, OverflowError): + pass + + # Support both 'output' (legacy) and 'output_str' (new JSONL) field names + output_str = result_data.get("output_str") or result_data.get("output", "") + + ArchiveResult.objects.create( + snapshot=self, + plugin=plugin, + hook_name=result_data.get("hook_name", ""), + status=result_data.get("status", "failed"), + output_str=output_str, + cmd=result_data.get("cmd", []), + pwd=result_data.get("pwd", str(self.output_dir)), + start_ts=start_ts, + end_ts=end_ts, + ) except Exception: pass - return None - @cached_property - def status_code(self) -> Optional[str]: - return self.headers and self.headers.get('Status-Code') + def write_index_json(self): + """Write index.json in 0.9.x format (deprecated, use write_index_jsonl).""" + import json - @cached_property - def history(self) -> dict: - # TODO: use ArchiveResult for this instead of json - return self.as_link_with_details().history + index_path = Path(self.output_dir) / "index.json" + + data = { + "url": self.url, + "timestamp": self.timestamp, + "title": self.title or "", + "tags": ",".join(sorted(self.tags.values_list("name", flat=True))), + "fs_version": self.fs_version, + "bookmarked_at": self.bookmarked_at.isoformat() if self.bookmarked_at else None, + "created_at": self.created_at.isoformat() if self.created_at else None, + "archive_results": [ + { + 
"plugin": ar.plugin, + "status": ar.status, + "start_ts": ar.start_ts.isoformat() if ar.start_ts else None, + "end_ts": ar.end_ts.isoformat() if ar.end_ts else None, + "output": ar.output_str or "", + "cmd": ar.cmd if isinstance(ar.cmd, list) else [], + "pwd": ar.pwd, + } + for ar in ArchiveResult.objects.filter(snapshot=self).order_by("start_ts") + ], + } + + index_path.parent.mkdir(parents=True, exist_ok=True) + with open(index_path, "w") as f: + json.dump(data, f, indent=2, sort_keys=True) + + def write_index_jsonl(self): + """ + Write index.jsonl in flat JSONL format. + + Each line is a JSON record with a 'type' field: + - Snapshot: snapshot metadata (crawl_id, url, tags, etc.) + - ArchiveResult: extractor results (plugin, status, output, etc.) + - Binary: binary info used for the extraction + - Process: process execution details (cmd, exit_code, timing, etc.) + """ + import json + + index_path = Path(self.output_dir) / CONSTANTS.JSONL_INDEX_FILENAME + index_path.parent.mkdir(parents=True, exist_ok=True) + + # Track unique binaries and processes to avoid duplicates + binaries_seen = set() + processes_seen = set() + + with open(index_path, "w") as f: + # Write Snapshot record first (to_json includes crawl_id, fs_version) + f.write(json.dumps(self.to_json()) + "\n") + + # Write ArchiveResult records with their associated Binary and Process + # Use select_related to optimize queries + for ar in self.archiveresult_set.select_related("process__binary").order_by("start_ts"): + # Write Binary record if not already written + if ar.process and ar.process.binary and ar.process.binary_id not in binaries_seen: + binaries_seen.add(ar.process.binary_id) + f.write(json.dumps(ar.process.binary.to_json()) + "\n") + + # Write Process record if not already written + if ar.process and ar.process_id not in processes_seen: + processes_seen.add(ar.process_id) + f.write(json.dumps(ar.process.to_json()) + "\n") + + # Write ArchiveResult record + f.write(json.dumps(ar.to_json()) + "\n") 
+ + def read_index_jsonl(self) -> dict: + """ + Read index.jsonl and return parsed records grouped by type. + + Returns dict with keys: 'snapshot', 'archive_results', 'binaries', 'processes' + """ + from archivebox.machine.models import Process + from archivebox.misc.jsonl import ( + TYPE_SNAPSHOT, + TYPE_ARCHIVERESULT, + TYPE_BINARYREQUEST, + TYPE_BINARY, + TYPE_PROCESS, + ) + + index_path = Path(self.output_dir) / CONSTANTS.JSONL_INDEX_FILENAME + result: dict[str, Any] = { + "snapshot": None, + "archive_results": [], + "binaries": [], + "processes": [], + } + + if not index_path.exists(): + return result + + records = Process.parse_records_from_text(index_path.read_text()) + for record in records: + record_type = record.get("type") + if record_type == TYPE_SNAPSHOT: + result["snapshot"] = record + elif record_type == TYPE_ARCHIVERESULT: + result["archive_results"].append(record) + elif record_type in {TYPE_BINARYREQUEST, TYPE_BINARY}: + result["binaries"].append(record) + elif record_type == TYPE_PROCESS: + result["processes"].append(record) + + return result + + def convert_index_json_to_jsonl(self) -> bool: + """ + Convert index.json to index.jsonl format. + + Reads existing index.json, creates index.jsonl, and removes index.json. + Returns True if conversion was performed, False if no conversion needed. 
+ """ + import json + + json_path = Path(self.output_dir) / CONSTANTS.JSON_INDEX_FILENAME + jsonl_path = Path(self.output_dir) / CONSTANTS.JSONL_INDEX_FILENAME + + # Skip if already converted or no json file exists + if jsonl_path.exists() or not json_path.exists(): + return False - @cached_property - def latest_title(self) -> Optional[str]: - if self.title: - return self.title # whoopdedoo that was easy - try: - # take longest successful title from ArchiveResult db history - return sorted( - self.archiveresult_set\ - .filter(extractor='title', status='succeeded', output__isnull=False)\ - .values_list('output', flat=True), - key=lambda r: len(r), - )[-1] - except IndexError: + with open(json_path) as f: + data = json.load(f) + except (json.JSONDecodeError, OSError): + return False + + # Detect format version and extract records + fs_version = data.get("fs_version", "0.7.0") + + jsonl_path.parent.mkdir(parents=True, exist_ok=True) + with open(jsonl_path, "w") as f: + # Write Snapshot record + snapshot_record = { + "type": "Snapshot", + "id": str(self.id), + "crawl_id": str(self.crawl_id) if self.crawl_id else None, + "url": data.get("url", self.url), + "timestamp": data.get("timestamp", self.timestamp), + "title": data.get("title", self.title or ""), + "tags": data.get("tags", ""), + "fs_version": fs_version, + "bookmarked_at": data.get("bookmarked_at"), + "created_at": data.get("created_at"), + } + f.write(json.dumps(snapshot_record) + "\n") + + # Handle 0.8.x/0.9.x format (archive_results list) + for result_data in data.get("archive_results", []): + ar_record = { + "type": "ArchiveResult", + "snapshot_id": str(self.id), + "plugin": result_data.get("plugin", ""), + "status": result_data.get("status", ""), + "output_str": result_data.get("output", ""), + "start_ts": result_data.get("start_ts"), + "end_ts": result_data.get("end_ts"), + } + if result_data.get("cmd"): + ar_record["cmd"] = result_data["cmd"] + f.write(json.dumps(ar_record) + "\n") + + # Handle 0.7.x 
format (history dict) + if "history" in data and isinstance(data["history"], dict): + for plugin, result_list in data["history"].items(): + if not isinstance(result_list, list): + continue + for result_data in result_list: + ar_record = { + "type": "ArchiveResult", + "snapshot_id": str(self.id), + "plugin": result_data.get("plugin") or result_data.get("extractor") or plugin, + "status": result_data.get("status", ""), + "output_str": result_data.get("output", ""), + "start_ts": result_data.get("start_ts"), + "end_ts": result_data.get("end_ts"), + } + if result_data.get("cmd"): + ar_record["cmd"] = result_data["cmd"] + f.write(json.dumps(ar_record) + "\n") + + # Remove old index.json after successful conversion + try: + json_path.unlink() + except OSError: pass + return True + + # ========================================================================= + # Snapshot Utilities + # ========================================================================= + + @staticmethod + def move_directory_to_invalid(snapshot_dir: Path): + """ + Move invalid directory to data/invalid/YYYYMMDD/. + + Used by: archivebox update (when encountering invalid directories) + """ + from datetime import datetime + import shutil + + invalid_dir = CONSTANTS.DATA_DIR / "invalid" / datetime.now().strftime("%Y%m%d") + invalid_dir.mkdir(parents=True, exist_ok=True) + + dest = invalid_dir / snapshot_dir.name + counter = 1 + while dest.exists(): + dest = invalid_dir / f"{snapshot_dir.name}_{counter}" + counter += 1 + try: - # take longest successful title from Link json index file history - return sorted( - ( - result.output.strip() - for result in self.history['title'] - if result.status == 'succeeded' and result.output.strip() - ), - key=lambda r: len(r), - )[-1] - except (KeyError, IndexError): + shutil.move(str(snapshot_dir), str(dest)) + except Exception: pass - return None + @classmethod + def find_and_merge_duplicates(cls) -> int: + """ + Find and merge snapshots with same url:timestamp. 
+ Returns count of duplicate sets merged. - def save_tags(self, tags: List[str]=()) -> None: - tags_id = [] - for tag in tags: - if tag.strip(): - tags_id.append(Tag.objects.get_or_create(name=tag)[0].id) - self.tags.clear() - self.tags.add(*tags_id) + Used by: archivebox update (Phase 3: deduplication) + """ + from django.db.models import Count + + duplicates = cls.objects.values("url", "timestamp").annotate(count=Count("id")).filter(count__gt=1) + merged = 0 + for dup in duplicates.iterator(chunk_size=500): + snapshots = list( + cls.objects.filter(url=dup["url"], timestamp=dup["timestamp"]).order_by("created_at"), # Keep oldest + ) -class ArchiveResultManager(models.Manager): - def indexable(self, sorted: bool = True): - INDEXABLE_METHODS = [ r[0] for r in ARCHIVE_METHODS_INDEXING_PRECEDENCE ] - qs = self.get_queryset().filter(extractor__in=INDEXABLE_METHODS,status='succeeded') + if len(snapshots) > 1: + try: + cls._merge_snapshots(snapshots) + merged += 1 + except Exception: + pass - if sorted: - precedence = [ When(extractor=method, then=Value(precedence)) for method, precedence in ARCHIVE_METHODS_INDEXING_PRECEDENCE ] - qs = qs.annotate(indexing_precedence=Case(*precedence, default=Value(1000),output_field=IntegerField())).order_by('indexing_precedence') - return qs + return merged + @classmethod + def _merge_snapshots(cls, snapshots: Sequence["Snapshot"]): + """ + Merge exact duplicates. + Keep oldest, union files + ArchiveResults. 
+ """ + import shutil -class ArchiveResult(models.Model): - id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID') - uuid = models.UUIDField(default=uuid.uuid4, editable=False) + keeper = snapshots[0] + duplicates = snapshots[1:] - snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE) - extractor = models.CharField(choices=EXTRACTORS, max_length=32) - cmd = JSONField() - pwd = models.CharField(max_length=256) - cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True) - output = models.CharField(max_length=1024) - start_ts = models.DateTimeField(db_index=True) - end_ts = models.DateTimeField() - status = models.CharField(max_length=16, choices=STATUS_CHOICES) + keeper_dir = Path(keeper.output_dir) - objects = ArchiveResultManager() + for dup in duplicates: + dup_dir = Path(dup.output_dir) - def __str__(self): - return self.extractor + # Merge files + if dup_dir.exists() and dup_dir != keeper_dir: + for dup_file in dup_dir.rglob("*"): + if not dup_file.is_file(): + continue + + rel = dup_file.relative_to(dup_dir) + keeper_file = keeper_dir / rel + + if not keeper_file.exists(): + keeper_file.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(dup_file, keeper_file) + + try: + shutil.rmtree(dup_dir) + except Exception: + pass + + # Merge tags + for tag in dup.tags.all(): + keeper.tags.add(tag) + + # Move ArchiveResults + ArchiveResult.objects.filter(snapshot=dup).update(snapshot=keeper) + + # Delete + dup.delete() + + # ========================================================================= + # Output Directory Properties + # ========================================================================= + + @property + def output_dir_parent(self) -> str: + return "archive" + + @property + def output_dir_name(self) -> str: + return str(self.timestamp) + + def archive(self, overwrite=False, methods=None): + return bg_archive_snapshot(self, overwrite=overwrite, methods=methods) + + 
@admin.display(description="Tags") + def tags_str(self, nocache=True) -> str | None: + calc_tags_str = lambda: ",".join(sorted(tag.name for tag in self.tags.all())) + prefetched_cache = getattr(self, "_prefetched_objects_cache", {}) + if "tags" in prefetched_cache: + return calc_tags_str() + cache_key = f"{self.pk}-tags" + return cache.get_or_set(cache_key, calc_tags_str) if not nocache else calc_tags_str() + + def icons(self, path: str | None = None) -> str: + """Generate HTML icons showing which extractor plugins have succeeded for this snapshot""" + from django.utils.html import format_html + + cache_key = ( + f"result_icons:{self.pk}:{(self.downloaded_at or self.modified_at or self.created_at or self.bookmarked_at).timestamp()}" + ) + + def calc_icons(): + prefetched_cache = getattr(self, "_prefetched_objects_cache", {}) + if "archiveresult_set" in prefetched_cache: + archive_results = { + r.plugin: r for r in self.archiveresult_set.all() if r.status == "succeeded" and (r.output_files or r.output_str) + } + else: + # Filter for results that have either output_files or output_str + from django.db.models import Q + + archive_results = { + r.plugin: r + for r in self.archiveresult_set.filter( + Q(status="succeeded") & (Q(output_files__isnull=False) | ~Q(output_str="")), + ) + } + + archive_path = path or self.archive_path + output = "" + output_template = '{}' + + # Get all plugins from hooks system (sorted by numeric prefix) + all_plugins = [get_plugin_name(e) for e in get_plugins()] + + for plugin in all_plugins: + result = archive_results.get(plugin) + existing = result and result.status == "succeeded" and (result.output_files or result.output_str) + icon = mark_safe(get_plugin_icon(plugin)) + + # Skip plugins with empty icons that have no output + # (e.g., staticfile only shows when there's actual output) + if not icon.strip() and not existing: + continue + + embed_path = result.embed_path() if result else f"{plugin}/" + output += format_html( + 
output_template, + archive_path, + embed_path, + str(bool(existing)), + plugin, + icon, + ) + + return format_html( + '{}', + mark_safe(output), + ) + + cache_result = cache.get(cache_key) + if cache_result: + return cache_result + + fresh_result = calc_icons() + cache.set(cache_key, fresh_result, timeout=60 * 60 * 24) + return fresh_result + + @property + def api_url(self) -> str: + return str(reverse_lazy("api-1:get_snapshot", args=[self.id])) + + def get_absolute_url(self): + return f"/{self.archive_path}" + + @cached_property + def domain(self) -> str: + return url_domain(self.url) + + @property + def title_stripped(self) -> str: + return (self.title or "").strip() + + @staticmethod + def _normalize_title_candidate(candidate: str | None, *, snapshot_url: str) -> str: + title = " ".join(line.strip() for line in str(candidate or "").splitlines() if line.strip()).strip() + if not title: + return "" + if title.lower() in {"pending...", "no title found"}: + return "" + if title == snapshot_url: + return "" + if title.startswith(("http://", "https://")): + return "" + if "/" in title and title.lower().endswith(".txt"): + return "" + return title + + @property + def resolved_title(self) -> str: + stored_title = self._normalize_title_candidate(self.title, snapshot_url=self.url) + if stored_title: + return stored_title + + title_result = ( + self.archiveresult_set.filter(plugin="title").exclude(output_str="").order_by("-start_ts", "-end_ts", "-created_at").first() + ) + if title_result: + result_title = self._normalize_title_candidate(title_result.output_str, snapshot_url=self.url) + if result_title: + return result_title + + title_file = self.output_dir / "title" / "title.txt" + if title_file.exists(): + try: + file_title = self._normalize_title_candidate(title_file.read_text(encoding="utf-8"), snapshot_url=self.url) + except OSError: + file_title = "" + if file_title: + return file_title + + return "" + + @cached_property + def hashes_index(self) -> dict[str, 
dict[str, Any]]: + hashes_path = self.output_dir / "hashes" / "hashes.json" + if not hashes_path.exists(): + return {} + + try: + data = json.loads(hashes_path.read_text(encoding="utf-8")) + except Exception: + return {} + + index: dict[str, dict[str, Any]] = {} + if isinstance(data, dict) and isinstance(data.get("files"), list): + for entry in data["files"]: + if not isinstance(entry, dict): + continue + path = str(entry.get("path") or "").strip().rstrip("/") + if not path: + continue + index[path] = { + "size": entry.get("size") or entry.get("num_bytes") or entry.get("bytes") or 0, + "is_dir": bool(entry.get("is_dir")) or str(entry.get("path") or "").endswith("/"), + "hash": entry.get("hash") or entry.get("hash_sha256"), + } + elif isinstance(data, dict): + for path, entry in data.items(): + if not isinstance(entry, dict) or path == ".": + continue + clean_path = str(path).rstrip("/") + if not clean_path: + continue + index[clean_path] = { + "size": entry.get("size") or entry.get("num_bytes") or 0, + "is_dir": bool(entry.get("mime_type") == "inode/directory" or str(path).endswith("/")), + "hash": entry.get("hash") or entry.get("hash_sha256"), + } + return index + + @property + def output_dir(self) -> Path: + """The filesystem path to the snapshot's output directory.""" + import os + + current_path = self.get_storage_path_for_version(self.fs_version) + + if current_path.exists(): + return current_path + + # Check for backwards-compat symlink + old_path = CONSTANTS.ARCHIVE_DIR / self.timestamp + if old_path.is_symlink(): + link_target = Path(os.readlink(old_path)) + return (old_path.parent / link_target).resolve() if not link_target.is_absolute() else link_target.resolve() + elif old_path.exists(): + return old_path + + return current_path + + def ensure_legacy_archive_symlink(self) -> None: + """Ensure the legacy archive/ path resolves to this snapshot.""" + import os + + legacy_path = CONSTANTS.ARCHIVE_DIR / self.timestamp + target = 
Path(self.get_storage_path_for_version(self._fs_current_version())) + + if target == legacy_path: + return + + legacy_path.parent.mkdir(parents=True, exist_ok=True) + + if legacy_path.exists() or legacy_path.is_symlink(): + if legacy_path.is_symlink(): + try: + if legacy_path.resolve() == target.resolve(): + return + except OSError: + pass + legacy_path.unlink(missing_ok=True) + else: + return + + rel_target = os.path.relpath(target, legacy_path.parent) + try: + legacy_path.symlink_to(rel_target, target_is_directory=True) + except OSError: + return + + def ensure_crawl_symlink(self) -> None: + """Ensure snapshot is symlinked under its crawl output directory.""" + import os + from pathlib import Path + from django.utils import timezone + from archivebox import DATA_DIR + from archivebox.crawls.models import Crawl + + if not self.crawl_id: + return + crawl = Crawl.objects.filter(id=self.crawl_id).select_related("created_by").first() + if not crawl: + return + + date_base = crawl.created_at or self.created_at or timezone.now() + date_str = date_base.strftime("%Y%m%d") + domain = self.extract_domain_from_url(self.url) + username = crawl.created_by.username if getattr(crawl, "created_by_id", None) else "system" + + crawl_dir = DATA_DIR / "users" / username / "crawls" / date_str / domain / str(crawl.id) + link_path = crawl_dir / "snapshots" / domain / str(self.id) + link_parent = link_path.parent + link_parent.mkdir(parents=True, exist_ok=True) + + target = Path(self.output_dir) + if link_path.exists() or link_path.is_symlink(): + if link_path.is_symlink(): + if link_path.resolve() == target.resolve(): + return + link_path.unlink(missing_ok=True) + else: + return + + rel_target = os.path.relpath(target, link_parent) + try: + link_path.symlink_to(rel_target, target_is_directory=True) + except OSError: + return + + @cached_property + def legacy_archive_path(self) -> str: + return f"{CONSTANTS.ARCHIVE_DIR_NAME}/{self.timestamp}" + + @cached_property + def 
archive_path_from_db(self) -> str: + """Best-effort public URL path derived from DB fields only.""" + if self.fs_version in ("0.7.0", "0.8.0"): + return self.legacy_archive_path + + if self.fs_version in ("0.9.0", "1.0.0"): + username = "web" + crawl = getattr(self, "crawl", None) + if crawl and getattr(crawl, "created_by_id", None): + username = crawl.created_by.username + if username == "system": + username = "web" + + date_base = self.created_at or self.bookmarked_at + if date_base: + date_str = date_base.strftime("%Y%m%d") + else: + try: + date_str = datetime.fromtimestamp(float(self.timestamp)).strftime("%Y%m%d") + except (TypeError, ValueError, OSError): + return self.legacy_archive_path + + domain = self.extract_domain_from_url(self.url) + return f"{username}/{date_str}/{domain}/{self.id}" + + return self.legacy_archive_path + + @cached_property + def url_path(self) -> str: + """URL path matching the current snapshot output_dir layout.""" + try: + rel_path = Path(self.output_dir).resolve().relative_to(CONSTANTS.DATA_DIR) + except Exception: + return self.legacy_archive_path + + parts = rel_path.parts + # New layout: users//snapshots//// + if len(parts) >= 6 and parts[0] == "users" and parts[2] == "snapshots": + username = parts[1] + if username == "system": + username = "web" + date_str = parts[3] + domain = parts[4] + snapshot_id = parts[5] + return f"{username}/{date_str}/{domain}/{snapshot_id}" + + # Legacy layout: archive// + if len(parts) >= 2 and parts[0] == CONSTANTS.ARCHIVE_DIR_NAME: + return f"{parts[0]}/{parts[1]}" + + return "/".join(parts) + + @cached_property + def archive_path(self): + return self.url_path + + @cached_property + def archive_size(self): + if hasattr(self, "output_size_sum"): + return int(self.output_size_sum or 0) + + prefetched_results = None + if hasattr(self, "_prefetched_objects_cache"): + prefetched_results = self._prefetched_objects_cache.get("archiveresult_set") + if prefetched_results is not None: + return 
sum(result.output_size or result.output_size_from_files() for result in prefetched_results) + + stats = self.archiveresult_set.aggregate(result_count=models.Count("id"), total_size=models.Sum("output_size")) + if stats["result_count"]: + return int(stats["total_size"] or 0) + try: + return get_dir_size(self.output_dir)[0] + except Exception: + return 0 + + def save_tags(self, tags: Iterable[str] = ()) -> None: + tags_id = [Tag.objects.get_or_create(name=tag)[0].pk for tag in tags if tag.strip()] + self.tags.clear() + self.tags.add(*tags_id) + + def pending_archiveresults(self) -> QuerySet["ArchiveResult"]: + return self.archiveresult_set.exclude(status__in=ArchiveResult.FINAL_OR_ACTIVE_STATES) + + def run(self) -> list["ArchiveResult"]: + """ + Execute snapshot by creating pending ArchiveResults for all enabled hooks. + + Returns: + list[ArchiveResult]: Newly created pending results + """ + return self.create_pending_archiveresults() + + def cleanup(self): + """ + Clean up background ArchiveResult hooks and empty results. + + Called by the state machine when entering the 'sealed' state. + Deletes empty ArchiveResults after the abx-dl cleanup phase has finished. 
+ """ + # Clean up .pid files from output directory + if Path(self.output_dir).exists(): + for pid_file in Path(self.output_dir).glob("**/*.pid"): + pid_file.unlink(missing_ok=True) + + # Update all background ArchiveResults from filesystem (in case output arrived late) + results = self.archiveresult_set.filter(hook_name__contains=".bg.") + for ar in results: + ar.update_from_output() + + # Delete ArchiveResults that produced no output files + empty_ars = self.archiveresult_set.filter( + output_files={}, # No output files + ).filter( + status__in=ArchiveResult.FINAL_STATES, # Only delete finished ones + ) + + deleted_count = empty_ars.count() + if deleted_count > 0: + empty_ars.delete() + print(f"[yellow]đŸ—‘ī¸ Deleted {deleted_count} empty ArchiveResults for {self.url}[/yellow]") + + def to_json(self) -> dict: + """ + Convert Snapshot model instance to a JSON-serializable dict. + Includes all fields needed to fully reconstruct/identify this snapshot. + """ + from archivebox.config import VERSION + + archive_size = self.archive_size + + return { + "type": "Snapshot", + "schema_version": VERSION, + "id": str(self.id), + "crawl_id": str(self.crawl_id), + "url": self.url, + "title": self.title, + "tags": self.tags_str(), + "bookmarked_at": self.bookmarked_at.isoformat() if self.bookmarked_at else None, + "created_at": self.created_at.isoformat() if self.created_at else None, + "timestamp": self.timestamp, + "depth": self.depth, + "status": self.status, + "fs_version": self.fs_version, + "archive_size": archive_size, + "output_size": archive_size, + } + + @staticmethod + def from_json(record: dict[str, Any], overrides: dict[str, Any] | None = None, queue_for_extraction: bool = True): + """ + Create/update Snapshot from JSON dict. 
+ + Unified method that handles: + - ID-based patching: {"id": "...", "title": "new title"} + - URL-based create/update: {"url": "...", "title": "...", "tags": "..."} + - Auto-creates Crawl if not provided + - Optionally queues for extraction + + Args: + record: Dict with 'url' (for create) or 'id' (for patch), plus other fields + overrides: Dict with 'crawl', 'snapshot' (parent), 'created_by_id' + queue_for_extraction: If True, sets status=QUEUED and retry_at (default: True) + + Returns: + Snapshot instance or None + """ + import re + from django.utils import timezone + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.config.common import GENERAL_CONFIG + + overrides = overrides or {} + + # If 'id' is provided, lookup and patch that specific snapshot + snapshot_id = record.get("id") + if snapshot_id: + try: + snapshot = Snapshot.objects.get(id=snapshot_id) + + # Generically update all fields present in record + update_fields = [] + for field_name, value in record.items(): + # Skip internal fields + if field_name in ("id", "type"): + continue + + # Skip if field doesn't exist on model + if not hasattr(snapshot, field_name): + continue + + # Special parsing for date fields + if field_name in ("bookmarked_at", "retry_at", "created_at", "modified_at"): + if value and isinstance(value, str): + value = parse_date(value) + + # Update field if value is provided and different + if value is not None and getattr(snapshot, field_name) != value: + setattr(snapshot, field_name, value) + update_fields.append(field_name) + + if update_fields: + snapshot.save(update_fields=update_fields + ["modified_at"]) + + return snapshot + except Snapshot.DoesNotExist: + # ID not found, fall through to create-by-URL logic + pass + + from archivebox.misc.util import fix_url_from_markdown, sanitize_extracted_url + + url = sanitize_extracted_url(fix_url_from_markdown(str(record.get("url") or "").strip())) + if not url: + return None + + # Determine or 
create crawl (every snapshot must have a crawl) + crawl = overrides.get("crawl") + parent_snapshot = overrides.get("snapshot") # Parent snapshot + created_by_id = overrides.get("created_by_id") or ( + parent_snapshot.created_by.pk if parent_snapshot else get_or_create_system_user_pk() + ) + + # DEBUG: Check if crawl_id in record matches overrides crawl + import sys + + record_crawl_id = record.get("crawl_id") + if record_crawl_id and crawl and str(crawl.id) != str(record_crawl_id): + print( + f"[yellow]âš ī¸ Snapshot.from_json crawl mismatch: record has crawl_id={record_crawl_id}, overrides has crawl={crawl.id}[/yellow]", + file=sys.stderr, + ) + + # If no crawl provided, inherit from parent or auto-create one + if not crawl: + if parent_snapshot: + # Inherit crawl from parent snapshot + crawl = parent_snapshot.crawl + else: + # Auto-create a single-URL crawl + from archivebox.crawls.models import Crawl + from archivebox.config import CONSTANTS + + timestamp_str = timezone.now().strftime("%Y-%m-%d__%H-%M-%S") + sources_file = CONSTANTS.SOURCES_DIR / f"{timestamp_str}__auto_crawl.txt" + sources_file.parent.mkdir(parents=True, exist_ok=True) + sources_file.write_text(url) + + crawl = Crawl.objects.create( + urls=url, + max_depth=0, + label=f"auto-created for {url[:50]}", + created_by_id=created_by_id, + ) + print(f"[red]âš ī¸ Snapshot.from_json auto-created new crawl {crawl.id} for url={url}[/red]", file=sys.stderr) + + # Parse tags (accept either a list ["tag1", "tag2"] or a comma-separated string "tag1,tag2") + tags_raw = record.get("tags", "") + tag_list = [] + if isinstance(tags_raw, list): + tag_list = list(dict.fromkeys(tag.strip() for tag in tags_raw if tag.strip())) + elif tags_raw: + tag_list = list( + dict.fromkeys(tag.strip() for tag in re.split(GENERAL_CONFIG.TAG_SEPARATOR_PATTERN, tags_raw) if tag.strip()), + ) + + # Check for existing snapshot with same URL in same crawl + # (URLs can exist in multiple crawls, but should be unique within a crawl) + 
snapshot = Snapshot.objects.filter(url=url, crawl=crawl).order_by("-created_at").first() + + title = record.get("title") + timestamp = record.get("timestamp") + + if snapshot: + # Update existing snapshot + if title and (not snapshot.title or len(title) > len(snapshot.title or "")): + snapshot.title = title + snapshot.save(update_fields=["title", "modified_at"]) + else: + # Create new snapshot + if timestamp: + while Snapshot.objects.filter(timestamp=timestamp).exists(): + timestamp = str(float(timestamp) + 1.0) + + snapshot = Snapshot.objects.create( + url=url, + timestamp=timestamp, + title=title, + crawl=crawl, + ) + + # Update tags + if tag_list: + existing_tags = set(snapshot.tags.values_list("name", flat=True)) + new_tags = set(tag_list) | existing_tags + snapshot.save_tags(new_tags) + + # Queue for extraction and update additional fields + update_fields = [] + + if queue_for_extraction: + snapshot.status = Snapshot.StatusChoices.QUEUED + snapshot.retry_at = timezone.now() + update_fields.extend(["status", "retry_at"]) + + # Update additional fields if provided + for field_name in ("depth", "parent_snapshot_id", "crawl_id", "bookmarked_at"): + value = record.get(field_name) + if value is not None and getattr(snapshot, field_name) != value: + setattr(snapshot, field_name, value) + update_fields.append(field_name) + + if update_fields: + snapshot.save(update_fields=update_fields + ["modified_at"]) + + snapshot.ensure_crawl_symlink() + + return snapshot + + def create_pending_archiveresults(self) -> list["ArchiveResult"]: + """ + Create ArchiveResult records for all enabled hooks. + + Uses the hooks system to discover available hooks from: + - abx_plugins/plugins/*/on_Snapshot__*.{py,sh,js} + - data/custom_plugins/*/on_Snapshot__*.{py,sh,js} + + Creates one ArchiveResult per hook (not per plugin), with hook_name set. + This enables step-based execution where all hooks in a step can run in parallel. 
+ """ + from archivebox.hooks import discover_hooks + from archivebox.config.configset import get_config + + # Get merged config with crawl-specific PLUGINS filter + config = get_config(crawl=self.crawl, snapshot=self) + hooks = discover_hooks("Snapshot", config=config) + archiveresults = [] + + for hook_path in hooks: + hook_name = hook_path.name # e.g., 'on_Snapshot__50_wget.py' + plugin = hook_path.parent.name # e.g., 'wget' + + # Check if AR already exists for this specific hook + if ArchiveResult.objects.filter(snapshot=self, hook_name=hook_name).exists(): + continue + + archiveresult, created = ArchiveResult.objects.get_or_create( + snapshot=self, + hook_name=hook_name, + defaults={ + "plugin": plugin, + "status": ArchiveResult.INITIAL_STATE, + }, + ) + if archiveresult.status == ArchiveResult.INITIAL_STATE: + archiveresults.append(archiveresult) + + return archiveresults + + def is_finished_processing(self) -> bool: + """ + Check if all ArchiveResults are finished. + + Note: This is only called for observability/progress tracking. + The shared runner owns execution and does not poll this. + """ + # Check if any ARs are still pending/started + pending = self.archiveresult_set.exclude( + status__in=ArchiveResult.FINAL_STATES, + ).exists() + + return not pending + + def get_progress_stats(self) -> dict: + """ + Get progress statistics for this snapshot's archiving process. 
def retry_failed_archiveresults(self) -> int:
    """
    Re-queue every failed, skipped, or noresults ArchiveResult of this snapshot.

    Matching rows are set back to QUEUED with their output fields and
    start/end timestamps cleared. When at least one row was reset, the
    snapshot itself is also returned to QUEUED at step 0 with retry_at set
    to the current time.

    Returns the number of ArchiveResults that were reset.
    """
    result_states = ArchiveResult.StatusChoices
    retryable = [
        result_states.FAILED,
        result_states.SKIPPED,
        result_states.NORESULTS,
    ]
    # Field values that put a result back into its never-run shape
    blank_slate = {
        "status": result_states.QUEUED,
        "output_str": "",
        "output_json": None,
        "output_files": {},
        "output_size": 0,
        "output_mimetypes": "",
        "start_ts": None,
        "end_ts": None,
    }
    num_reset = self.archiveresult_set.filter(status__in=retryable).update(**blank_slate)

    if num_reset <= 0:
        return num_reset

    self.status = self.StatusChoices.QUEUED
    self.retry_at = timezone.now()
    self.current_step = 0  # Reset to step 0 for retry
    self.save(update_fields=["status", "retry_at", "current_step", "modified_at"])
    return num_reset
@cached_property
def is_static(self) -> bool:
    """
    Whether the URL ends in a known static-file extension.

    The comparison is case-insensitive and is made against the very end of
    the raw URL, so a URL with a query string or fragment after the
    extension is not treated as static.
    """
    static_suffixes = (
        ".pdf",
        ".jpg",
        ".jpeg",
        ".png",
        ".gif",
        ".webp",
        ".svg",
        ".mp4",
        ".mp3",
        ".wav",
        ".webm",
    )
    lowered_url = self.url.lower()
    # str.endswith accepts a tuple and is True if any suffix matches
    return lowered_url.endswith(static_suffixes)
def discover_outputs(self, include_filesystem_fallback: bool = True) -> list[dict]:
    """
    Discover this snapshot's output files from ArchiveResults, the hashes index, and the filesystem.

    Sources are consulted in this order, and a name already claimed by an
    earlier source is not listed again by a later one:
      1. ArchiveResult rows, ordered by start_ts
      2. top-level directories recorded in self.hashes_index
      3. entries directly inside the snapshot output directory
         (only when include_filesystem_fallback is True)

    Args:
        include_filesystem_fallback: when False, skip the embed_path()
            lookup for results without a DB embed path, skip measuring
            on-disk size for results without a recorded size, and skip the
            final scan of the output directory.

    Returns:
        A list of dicts with the keys "name", "path", "ts", "size",
        "is_metadata", "is_compact" and "result" ("result" is the
        ArchiveResult for source 1 and None otherwise).
    """
    from archivebox.misc.util import ts_to_date_str

    ArchiveResult = self.archiveresult_set.model
    snap_dir = Path(self.output_dir)
    # Read once up front instead of once per result inside the loop below
    hashes_index = self.hashes_index
    outputs: list[dict] = []
    seen: set[str] = set()

    text_exts = (".json", ".jsonl", ".txt", ".csv", ".tsv", ".xml", ".yml", ".yaml", ".md", ".log")

    def is_text_path(path: str | None) -> bool:
        return (path or "").lower().endswith(text_exts)

    def add_output(name, path, ts, size, result=None) -> None:
        # "is_metadata" and "is_compact" are both driven by the same extension list
        text_like = is_text_path(path)
        outputs.append(
            {
                "name": name,
                "path": path,
                "ts": ts,
                "size": size,
                "is_metadata": text_like,
                "is_compact": text_like,
                "result": result,
            },
        )
        seen.add(name)

    # 1) Outputs reported by ArchiveResult rows
    for result in self.archiveresult_set.all().order_by("start_ts"):
        embed_path = result.embed_path_db()
        if not embed_path and include_filesystem_fallback:
            embed_path = result.embed_path()
        if not embed_path or embed_path.strip() in (".", "/", "./"):
            continue

        size = result.output_size or result.output_size_from_files() or hashes_index.get(embed_path, {}).get("size") or 0
        if not size and include_filesystem_fallback:
            abs_path = snap_dir / embed_path
            if not abs_path.exists():
                continue
            if abs_path.is_dir():
                # Walk the tree once; a directory without files is not an output
                files = [p for p in abs_path.rglob("*") if p.is_file()]
                if not files:
                    continue
                size = sum(p.stat().st_size for p in files)
            else:
                size = abs_path.stat().st_size

        plugin_lower = (result.plugin or "").lower()
        if plugin_lower in ("ytdlp", "yt-dlp", "youtube-dl"):
            # For these plugins the size of the whole plugin directory replaces
            # whatever size was determined above.
            plugin_dir = snap_dir / result.plugin
            if plugin_dir.exists():
                try:
                    size = sum(p.stat().st_size for p in plugin_dir.rglob("*") if p.is_file())
                except OSError:
                    # Best effort: keep the previously determined size
                    pass

        add_output(result.plugin, embed_path, ts_to_date_str(result.end_ts), size or 0, result)

    # 2) Top-level directories that only appear in the hashes index
    if hashes_index:
        grouped_hash_outputs: dict[str, dict[str, dict[str, Any]]] = {}
        ignored_roots = {"index.html", "index.json", "index.jsonl", "favicon.ico", "warc", "hashes"}
        for rel_path, meta in hashes_index.items():
            parts = Path(rel_path).parts
            if len(parts) < 2:
                # Files directly in the snapshot root are not grouped here
                continue
            root = parts[0]
            if root.startswith(".") or root in seen or root in ignored_roots:
                continue
            child_path = str(Path(*parts[1:]))
            grouped_hash_outputs.setdefault(root, {})[child_path] = meta

        fallback_ts = ts_to_date_str(self.downloaded_at or self.created_at)
        for root, root_entries in grouped_hash_outputs.items():
            fallback_path = ArchiveResult._fallback_output_file_path(list(root_entries), root, root_entries)
            if not fallback_path:
                continue
            fallback_meta = root_entries.get(fallback_path, {})
            add_output(root, f"{root}/{fallback_path}", fallback_ts, int(fallback_meta.get("size") or 0))

    if not include_filesystem_fallback:
        return outputs

    # The output directory may not exist; Path.iterdir() raises for a missing
    # directory, so there is nothing further to discover in that case.
    if not snap_dir.is_dir():
        return outputs

    embeddable_exts = {
        "html",
        "htm",
        "pdf",
        "txt",
        "md",
        "json",
        "jsonl",
        "csv",
        "tsv",
        "png",
        "jpg",
        "jpeg",
        "gif",
        "webp",
        "svg",
        "ico",
        "mp4",
        "webm",
        "mp3",
        "opus",
        "ogg",
        "wav",
    }
    # NOTE(review): this skip list is narrower than ignored_roots above
    # ("index.jsonl", "hashes" and dot-prefixed names are not skipped here) —
    # confirm whether that difference is intentional.
    skipped_names = ("index.html", "index.json", "favicon.ico", "warc")

    # 3) Anything else found directly inside the snapshot directory
    for entry in snap_dir.iterdir():
        if entry.name in skipped_names:
            continue
        if entry.is_dir():
            plugin = entry.name
            if plugin in seen:
                continue
            best_file = ArchiveResult._find_best_output_file(entry, plugin)
            if not best_file:
                continue
            best_file_stat = best_file.stat()
            add_output(
                plugin,
                str(best_file.relative_to(snap_dir)),
                ts_to_date_str(best_file_stat.st_mtime or 0),
                best_file_stat.st_size or 0,
            )
        elif entry.is_file():
            ext = entry.suffix.lstrip(".").lower()
            if ext not in embeddable_exts:
                continue
            plugin = entry.stem
            if plugin in seen:
                continue
            entry_stat = entry.stat()
            add_output(
                plugin,
                entry.name,
                ts_to_date_str(entry_stat.st_mtime or 0),
                entry_stat.st_size or 0,
            )

    return outputs
import build_snapshot_url + + archive_size = self.archive_size + + result = { + "TYPE": "core.models.Snapshot", + "id": str(self.id), + "crawl_id": str(self.crawl_id), + "url": self.url, + "timestamp": self.timestamp, + "title": self.title, + "tags": sorted(tag.name for tag in self.tags.all()), + "downloaded_at": self.downloaded_at.isoformat() if self.downloaded_at else None, + "bookmarked_at": self.bookmarked_at.isoformat() if self.bookmarked_at else None, + "created_at": self.created_at.isoformat() if self.created_at else None, + "modified_at": self.modified_at.isoformat() if self.modified_at else None, + "retry_at": self.retry_at.isoformat() if self.retry_at else None, + "depth": self.depth, + "status": self.status, + "fs_version": self.fs_version, + # Computed properties + "domain": self.domain, + "scheme": self.scheme, + "base_url": self.base_url, + "path": self.path, + "basename": self.basename, + "extension": self.extension, + "is_static": self.is_static, + "is_archived": self.is_archived, + "archive_path": self.archive_path, + "archive_url": build_snapshot_url(str(self.id), "index.html"), + "output_dir": self.output_dir, + "link_dir": self.output_dir, # backwards compatibility alias + "archive_size": archive_size, + "output_size": archive_size, + "bookmarked_date": self.bookmarked_date, + "downloaded_datestr": self.downloaded_datestr, + "num_outputs": self.num_outputs, + "num_failures": self.num_failures, + } + return result + + def to_json_str(self, indent: int = 4) -> str: + """Convert to JSON string (legacy method, use to_json() for dict)""" + return to_json(self.to_dict(extended=True), indent=indent) + + def to_csv(self, cols: list[str] | None = None, separator: str = ",", ljust: int = 0) -> str: + """Convert to CSV string""" + data = self.to_dict() + cols = cols or ["timestamp", "is_archived", "url"] + return separator.join(to_json(data.get(col, ""), indent=None).ljust(ljust) for col in cols) + + def write_json_details(self, out_dir: Path | str | None 
= None) -> None: + """Write JSON index file for this snapshot to its output directory""" + output_dir = Path(out_dir) if out_dir is not None else self.output_dir + path = output_dir / CONSTANTS.JSON_INDEX_FILENAME + atomic_write(str(path), self.to_dict(extended=True)) + + def write_html_details(self, out_dir: Path | str | None = None) -> None: + """Write HTML detail page for this snapshot to its output directory""" + from django.template.loader import render_to_string + from archivebox.config.common import SERVER_CONFIG + from archivebox.config.configset import get_config + from archivebox.core.widgets import TagEditorWidget + from archivebox.misc.logging_util import printable_filesize + + output_dir = Path(out_dir) if out_dir is not None else self.output_dir + config = get_config() + SAVE_ARCHIVE_DOT_ORG = config.get("SAVE_ARCHIVE_DOT_ORG", True) + TITLE_LOADING_MSG = "Not yet archived..." + + preview_priority = [ + "singlefile", + "screenshot", + "wget", + "dom", + "pdf", + "readability", + ] + + outputs = self.discover_outputs(include_filesystem_fallback=True) + loose_items, failed_items = self.get_detail_page_auxiliary_items(outputs) + outputs_by_plugin = {out["name"]: out for out in outputs} + output_size = sum(int(out.get("size") or 0) for out in outputs) + is_archived = bool(outputs or self.downloaded_at or self.status == self.StatusChoices.SEALED) + + best_preview_path = "about:blank" + best_result = {"path": "about:blank", "result": None} + for plugin in preview_priority: + out = outputs_by_plugin.get(plugin) + if out and out.get("path"): + best_preview_path = str(out["path"]) + best_result = out + break + + if best_preview_path == "about:blank" and outputs: + best_preview_path = str(outputs[0].get("path") or "about:blank") + best_result = outputs[0] + tag_widget = TagEditorWidget() + context = { + **self.to_dict(extended=True), + "snapshot": self, + "title": htmlencode(self.resolved_title or (self.base_url if is_archived else TITLE_LOADING_MSG)), + 
"url_str": htmlencode(urldecode(self.base_url)), + "archive_url": urlencode(f"warc/{self.timestamp}" or (self.domain if is_archived else "")) or "about:blank", + "extension": self.extension or "html", + "tags": self.tags_str() or "untagged", + "size": printable_filesize(output_size) if output_size else "pending", + "status": "archived" if is_archived else "not yet archived", + "status_color": "success" if is_archived else "danger", + "oldest_archive_date": ts_to_date_str(self.oldest_archive_date), + "SAVE_ARCHIVE_DOT_ORG": SAVE_ARCHIVE_DOT_ORG, + "PREVIEW_ORIGINALS": SERVER_CONFIG.PREVIEW_ORIGINALS, + "best_preview_path": best_preview_path, + "best_result": best_result, + "archiveresults": outputs, + "loose_items": loose_items, + "failed_items": failed_items, + "related_snapshots": [], + "related_years": [], + "title_tags": [{"name": tag.name, "style": tag_widget._tag_style(tag.name)} for tag in self.tags.all().order_by("name")], + } + rendered_html = render_to_string("core/snapshot.html", context) + atomic_write(str(output_dir / CONSTANTS.HTML_INDEX_FILENAME), rendered_html) + + # ========================================================================= + # Helper Methods + # ========================================================================= + + def get_detail_page_auxiliary_items( + self, + outputs: list[dict] | None = None, + hidden_card_plugins: set[str] | None = None, + ) -> tuple[list[dict[str, object]], list[dict[str, object]]]: + outputs = outputs or self.discover_outputs(include_filesystem_fallback=True) + hidden_card_plugins = hidden_card_plugins or set() + accounted_entries: set[str] = set() + for output in outputs: + output_name = str(output.get("name") or "") + if output_name: + accounted_entries.add(output_name) + output_path = str(output.get("path") or "") + if not output_path: + continue + parts = Path(output_path).parts + if parts: + accounted_entries.add(parts[0]) + + ignore_names = {".DS_Store", "index.html", "index.json", "index.jsonl", 
"favicon.ico"} + loose_items: list[dict[str, object]] = [] + if self.hashes_index: + grouped: dict[str, dict[str, object]] = {} + for rel_path, meta in self.hashes_index.items(): + parts = Path(rel_path).parts + if not parts: + continue + root = parts[0] + if root.startswith(".") or root in ignore_names or root in accounted_entries: + continue + entry = grouped.setdefault( + root, + { + "name": root, + "path": root, + "is_dir": len(parts) > 1 or bool(meta.get("is_dir")), + "size": 0, + }, + ) + entry["is_dir"] = bool(entry.get("is_dir")) or len(parts) > 1 or bool(meta.get("is_dir")) + entry["size"] = int(entry.get("size") or 0) + int(meta.get("size") or 0) + loose_items = sorted(grouped.values(), key=lambda item: str(item["name"]).lower()) + + ArchiveResult = self.archiveresult_set.model + failed_items: list[dict[str, object]] = [] + seen_failed: set[str] = set() + for result in self.archiveresult_set.all().order_by("start_ts"): + if result.status != ArchiveResult.StatusChoices.FAILED: + continue + root = str(result.plugin or "").strip() + if not root or root in seen_failed: + continue + seen_failed.add(root) + failed_items.append( + { + "name": f"{get_plugin_name(root)} ({result.status})", + "path": root, + "is_dir": True, + "size": int(result.output_size or 0), + }, + ) + + return loose_items, failed_items + + @staticmethod + def _ts_to_date_str(dt: datetime | None) -> str | None: + return dt.strftime("%Y-%m-%d %H:%M:%S") if dt else None + + +# ============================================================================= +# Snapshot State Machine +# ============================================================================= + + +class SnapshotMachine(BaseStateMachine): + """ + State machine for managing Snapshot lifecycle. 
+ + Hook Lifecycle: + ┌─────────────────────────────────────────────────────────────┐ + │ QUEUED State │ + │ â€ĸ Waiting for snapshot to be ready │ + └─────────────────────────────────────────────────────────────┘ + ↓ tick() when can_start() + ┌─────────────────────────────────────────────────────────────┐ + │ STARTED State → enter_started() │ + │ 1. snapshot.run() │ + │ â€ĸ discover_hooks('Snapshot') → finds all plugin hooks │ + │ â€ĸ create_pending_archiveresults() → creates ONE │ + │ ArchiveResult per hook (NO execution yet) │ + │ 2. The shared abx-dl runner executes hooks and the │ + │ projector updates ArchiveResult rows from events │ + │ 3. Advance through steps 0-9 as foreground hooks complete │ + └─────────────────────────────────────────────────────────────┘ + ↓ tick() when is_finished() + ┌─────────────────────────────────────────────────────────────┐ + │ SEALED State → enter_sealed() │ + │ â€ĸ cleanup() → kills any background hooks still running │ + │ â€ĸ Set retry_at=None (no more processing) │ + └─────────────────────────────────────────────────────────────┘ + + https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams + """ + + model_attr_name = "snapshot" + + # States + queued = State(value=Snapshot.StatusChoices.QUEUED, initial=True) + started = State(value=Snapshot.StatusChoices.STARTED) + sealed = State(value=Snapshot.StatusChoices.SEALED, final=True) + + # Tick Event (polled by workers) + tick = queued.to.itself(unless="can_start") | queued.to(started, cond="can_start") | started.to(sealed, cond="is_finished") + + # Manual event (can also be triggered by last ArchiveResult finishing) + seal = started.to(sealed) + + snapshot: Snapshot + + def can_start(self) -> bool: + can_start = bool(self.snapshot.url) + return can_start + + def is_finished(self) -> bool: + """Check if all ArchiveResults for this snapshot are finished.""" + return self.snapshot.is_finished_processing() + + @queued.enter + def enter_queued(self): + 
self.snapshot.update_and_requeue( + retry_at=timezone.now(), + status=Snapshot.StatusChoices.QUEUED, + ) + + @started.enter + def enter_started(self): + """Just mark as started. The shared runner creates ArchiveResults and runs hooks.""" + self.snapshot.status = Snapshot.StatusChoices.STARTED + self.snapshot.retry_at = None # No more polling + self.snapshot.save(update_fields=["status", "retry_at", "modified_at"]) + + @sealed.enter + def enter_sealed(self): + import sys + + # Clean up background hooks + self.snapshot.cleanup() + + self.snapshot.update_and_requeue( + retry_at=None, + status=Snapshot.StatusChoices.SEALED, + ) + + print(f"[cyan] ✅ SnapshotMachine.enter_sealed() - sealed {self.snapshot.url}[/cyan]", file=sys.stderr) + + # Check if this is the last snapshot for the parent crawl - if so, seal the crawl + if self.snapshot.crawl: + crawl = self.snapshot.crawl + remaining_active = Snapshot.objects.filter( + crawl=crawl, + status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED], + ).count() + + if remaining_active == 0 and crawl.status == crawl.StatusChoices.STARTED: + print(f"[cyan]🔒 All snapshots sealed for crawl {crawl.id}, sealing crawl[/cyan]", file=sys.stderr) + # Seal the parent crawl + cast(Any, crawl).sm.seal() + + +class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes): + class StatusChoices(models.TextChoices): + QUEUED = "queued", "Queued" + STARTED = "started", "Started" + BACKOFF = "backoff", "Waiting to retry" + SUCCEEDED = "succeeded", "Succeeded" + FAILED = "failed", "Failed" + SKIPPED = "skipped", "Skipped" + NORESULTS = "noresults", "No Results" + + INITIAL_STATE = StatusChoices.QUEUED + ACTIVE_STATE = StatusChoices.STARTED + FINAL_STATES = ( + StatusChoices.SUCCEEDED, + StatusChoices.FAILED, + StatusChoices.SKIPPED, + StatusChoices.NORESULTS, + ) + FINAL_OR_ACTIVE_STATES = (*FINAL_STATES, ACTIVE_STATE) + + @classmethod + def get_plugin_choices(cls): + """Get plugin choices from discovered hooks (for 
forms/admin).""" + plugins = [get_plugin_name(e) for e in get_plugins()] + return tuple((e, e) for e in plugins) + + # UUID primary key (migrated from integer in 0029) + id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True) + created_at = models.DateTimeField(default=timezone.now, db_index=True) + modified_at = models.DateTimeField(auto_now=True) + + snapshot: Snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE) # type: ignore + # No choices= constraint - plugin names come from plugin system and can be any string + plugin = models.CharField(max_length=32, blank=False, null=False, db_index=True, default="") + hook_name = models.CharField( + max_length=255, + blank=True, + default="", + db_index=True, + help_text="Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)", + ) + + # Process FK - tracks execution details (cmd, pwd, stdout, stderr, etc.) + # Added POST-v0.9.0, will be added in a separate migration + process = models.OneToOneField( + "machine.Process", + on_delete=models.PROTECT, + null=True, + blank=True, + related_name="archiveresult", + help_text="Process execution details for this archive result", + ) + + # New output fields (replacing old 'output' field) + output_str = models.TextField(blank=True, default="", help_text="Human-readable output summary") + output_json = models.JSONField(null=True, blank=True, default=None, help_text="Structured metadata (headers, redirects, etc.)") + output_files = models.JSONField(default=dict, help_text="Dict of {relative_path: {metadata}}") + output_size = models.BigIntegerField(default=0, help_text="Total bytes of all output files") + output_mimetypes = models.CharField(max_length=512, blank=True, default="", help_text="CSV of mimetypes sorted by size") + + start_ts = models.DateTimeField(default=None, null=True, blank=True) + end_ts = models.DateTimeField(default=None, null=True, blank=True) + + status = models.CharField(max_length=16, 
choices=StatusChoices.choices, default=StatusChoices.QUEUED, db_index=True) + notes = models.TextField(blank=True, null=False, default="") + # output_dir is computed via @property from snapshot.output_dir / plugin + + snapshot_id: uuid.UUID + process_id: uuid.UUID | None + + class Meta( + ModelWithOutputDir.Meta, + ModelWithConfig.Meta, + ModelWithNotes.Meta, + ): + app_label = "core" + verbose_name = "Archive Result" + verbose_name_plural = "Archive Results Log" + indexes = [ + models.Index(fields=["snapshot", "status"], name="archiveresult_snap_status_idx"), + ] + + def __str__(self): + return f"[{self.id}] {self.snapshot.url[:64]} -> {self.plugin}" + + @property + def created_by(self): + """Convenience property to access the user who created this archive result via its snapshot's crawl.""" + return self.snapshot.crawl.created_by + + def to_json(self) -> dict: + """ + Convert ArchiveResult model instance to a JSON-serializable dict. + """ + from archivebox.config import VERSION + + record = { + "type": "ArchiveResult", + "schema_version": VERSION, + "id": str(self.id), + "snapshot_id": str(self.snapshot_id), + "plugin": self.plugin, + "hook_name": self.hook_name, + "status": self.status, + "output_str": self.output_str, + "start_ts": self.start_ts.isoformat() if self.start_ts else None, + "end_ts": self.end_ts.isoformat() if self.end_ts else None, + } + # Include optional fields if set + if self.output_json: + record["output_json"] = self.output_json + if self.output_files: + record["output_files"] = self.output_files + if self.output_size: + record["output_size"] = self.output_size + if self.output_mimetypes: + record["output_mimetypes"] = self.output_mimetypes + if self.cmd: + record["cmd"] = self.cmd + if self.cmd_version: + record["cmd_version"] = self.cmd_version + if self.process_id: + record["process_id"] = str(self.process_id) + return record + + @staticmethod + def from_json(record: dict[str, Any], overrides: dict[str, Any] | None = None): + """ + 
Create/update ArchiveResult from JSON dict. + + Args: + record: JSON dict with 'snapshot_id', 'plugin', etc. + overrides: Optional dict of field overrides + + Returns: + ArchiveResult instance or None + """ + snapshot_id = record.get("snapshot_id") + plugin = record.get("plugin") + + if not snapshot_id or not plugin: + return None + + # Try to get existing by ID first + result_id = record.get("id") + if result_id: + try: + return ArchiveResult.objects.get(id=result_id) + except ArchiveResult.DoesNotExist: + pass + + # Get or create by snapshot_id + plugin + try: + snapshot = Snapshot.objects.get(id=snapshot_id) + + result, _ = ArchiveResult.objects.get_or_create( + snapshot=snapshot, + plugin=plugin, + defaults={ + "hook_name": record.get("hook_name", ""), + "status": record.get("status", "queued"), + "output_str": record.get("output_str", ""), + }, + ) + return result + except Snapshot.DoesNotExist: + return None + + def save(self, *args, **kwargs): + # Skip ModelWithOutputDir.save() to avoid creating index.json in plugin directories + # Call the Django Model.save() directly instead + models.Model.save(self, *args, **kwargs) + + # if is_new: + # from archivebox.misc.logging_util import log_worker_event + # log_worker_event( + # worker_type='DB', + # event='Created ArchiveResult', + # indent_level=3, + # plugin=self.plugin, + # metadata={ + # 'id': str(self.id), + # 'snapshot_id': str(self.snapshot_id), + # 'snapshot_url': str(self.snapshot.url)[:64], + # 'status': self.status, + # }, + # ) + + @cached_property + def snapshot_dir(self): + return Path(self.snapshot.output_dir) + + @cached_property + def url(self): + return self.snapshot.url + + @property + def api_url(self) -> str: + return str(reverse_lazy("api-1:get_archiveresult", args=[self.id])) + + def get_absolute_url(self): + return f"/{self.snapshot.archive_path}/{self.plugin}" + + def reset_for_retry(self, *, save: bool = True) -> None: + self.status = self.StatusChoices.QUEUED + self.output_str = "" + 
self.output_json = None + self.output_files = {} + self.output_size = 0 + self.output_mimetypes = "" + self.start_ts = None + self.end_ts = None + if save: + self.save( + update_fields=[ + "status", + "output_str", + "output_json", + "output_files", + "output_size", + "output_mimetypes", + "start_ts", + "end_ts", + "modified_at", + ], + ) + + @property + def plugin_module(self) -> Any | None: + # Hook scripts are now used instead of Python plugin modules + # The plugin name maps to hooks in abx_plugins/plugins/{plugin}/ + return None + + @staticmethod + def _normalize_output_files(raw_output_files: Any) -> dict[str, dict[str, Any]]: + from abx_dl.output_files import guess_mimetype + + def _enrich_metadata(path: str, metadata: dict[str, Any]) -> dict[str, Any]: + normalized = dict(metadata) + if "extension" not in normalized: + normalized["extension"] = Path(path).suffix.lower().lstrip(".") + if "mimetype" not in normalized: + guessed = guess_mimetype(path) + if guessed: + normalized["mimetype"] = guessed + return normalized + + if raw_output_files is None: + return {} + if isinstance(raw_output_files, str): + try: + raw_output_files = json.loads(raw_output_files) + except json.JSONDecodeError: + return {} + if isinstance(raw_output_files, dict): + normalized: dict[str, dict[str, Any]] = {} + for path, metadata in raw_output_files.items(): + if not path: + continue + metadata_dict = dict(metadata) if isinstance(metadata, dict) else {} + metadata_dict.pop("path", None) + normalized[str(path)] = _enrich_metadata(str(path), metadata_dict) + return normalized + if isinstance(raw_output_files, (list, tuple, set)): + normalized: dict[str, dict[str, Any]] = {} + for item in raw_output_files: + if isinstance(item, str): + normalized[item] = _enrich_metadata(item, {}) + continue + if not isinstance(item, dict): + continue + path = str(item.get("path") or "").strip() + if not path: + continue + normalized[path] = _enrich_metadata( + path, + {key: value for key, value in 
item.items() if key != "path" and value not in (None, "")}, + ) + return normalized + return {} + + @staticmethod + def _coerce_output_file_size(value: Any) -> int: + try: + return max(int(value or 0), 0) + except (TypeError, ValueError): + return 0 + + def output_file_map(self) -> dict[str, dict[str, Any]]: + return self._normalize_output_files(self.output_files) + + def output_file_paths(self) -> list[str]: + return list(self.output_file_map().keys()) + + def output_file_count(self) -> int: + return len(self.output_file_paths()) + + def output_size_from_files(self) -> int: + return sum(self._coerce_output_file_size(metadata.get("size")) for metadata in self.output_file_map().values()) + + def output_exists(self) -> bool: + return os.path.exists(Path(self.snapshot_dir) / self.plugin) + + @staticmethod + def _looks_like_output_path(raw_output: str | None, plugin_name: str | None = None) -> bool: + value = str(raw_output or "").strip() + if value in ("", ".", "./", "/"): + return False + if plugin_name and value.startswith(f"{plugin_name}/"): + return True + if Path(value).is_absolute(): + return True + if Path(value).suffix: + return True + if "/" in value and "\\" not in value and " " not in value: + left, _, right = value.partition("/") + if left and right and all(ch.isalnum() or ch in "+-." 
for ch in left + right): + return False + return False + + def _existing_output_path(self, raw_output: str | None) -> str | None: + value = str(raw_output or "").strip() + if not value: + return None + + output_path = Path(value) + snapshot_dir = Path(self.snapshot_dir).resolve(strict=False) + candidates: list[str] = [] + + if output_path.is_absolute(): + try: + candidates.append(str(output_path.resolve(strict=False).relative_to(snapshot_dir))) + except (OSError, ValueError): + return None + elif value.startswith(f"{self.plugin}/"): + candidates.append(value) + elif len(output_path.parts) == 1: + candidates.append(f"{self.plugin}/{value}") + else: + candidates.append(value) + + output_file_map = self.output_file_map() + hashes_index = self.snapshot.hashes_index + for relative_path in candidates: + if relative_path in hashes_index: + return relative_path + + if relative_path in output_file_map: + return relative_path + + plugin_relative = relative_path.removeprefix(f"{self.plugin}/") + if plugin_relative in output_file_map: + return relative_path + + candidate = snapshot_dir / relative_path + try: + if candidate.is_file(): + return relative_path + except OSError: + continue + + return None + + @staticmethod + def _fallback_output_file_path( + output_file_paths: Sequence[str], + plugin_name: str | None = None, + output_file_map: dict[str, dict[str, Any]] | None = None, + ) -> str | None: + ignored = {"stdout.log", "stderr.log", "hook.pid", "listener.pid", "cmd.sh"} + candidates = [ + path + for path in output_file_paths + if Path(path).name not in ignored and Path(path).suffix.lower() not in (".pid", ".log", ".sh") + ] + if not candidates: + return None + + output_file_map = output_file_map or {} + preferred_names = [ + "index.html", + "index.htm", + "output.html", + "content.html", + "article.html", + "output.pdf", + "index.pdf", + "content.txt", + "output.txt", + "index.txt", + "index.md", + "index.json", + "article.json", + ] + for preferred_name in 
preferred_names: + for candidate in candidates: + if Path(candidate).name.lower() == preferred_name: + return candidate + + ext_groups = ( + (".html", ".htm", ".pdf"), + (".png", ".jpg", ".jpeg", ".gif", ".webp", ".svg", ".ico"), + (".json", ".jsonl", ".txt", ".md", ".csv", ".tsv"), + (".mp4", ".webm", ".mp3", ".opus", ".ogg", ".wav"), + ) + for ext_group in ext_groups: + group_candidates = [candidate for candidate in candidates if Path(candidate).suffix.lower() in ext_group] + if group_candidates: + return max( + group_candidates, + key=lambda path: ArchiveResult._coerce_output_file_size(output_file_map.get(path, {}).get("size")), + ) + + return None + + @staticmethod + def _find_best_output_file(dir_path: Path, plugin_name: str | None = None) -> Path | None: + if not dir_path.exists() or not dir_path.is_dir(): + return None + file_map: dict[str, dict[str, Any]] = {} + file_count = 0 + max_scan = 500 + for file_path in dir_path.rglob("*"): + file_count += 1 + if file_count > max_scan: + break + if file_path.is_dir() or file_path.name.startswith("."): + continue + rel_path = str(file_path.relative_to(dir_path)) + try: + size = file_path.stat().st_size + except OSError: + size = 0 + file_map[rel_path] = {"size": size} + + fallback_path = ArchiveResult._fallback_output_file_path(list(file_map.keys()), plugin_name, file_map) + if not fallback_path: + return None + return dir_path / fallback_path + + def embed_path_db(self) -> str | None: + output_file_map = self.output_file_map() + + if self.output_str: + raw_output = str(self.output_str).strip() + if self._looks_like_output_path(raw_output, self.plugin): + existing_output = self._existing_output_path(raw_output) + if existing_output: + return existing_output + + output_file_paths = list(output_file_map.keys()) + if output_file_paths: + fallback_path = self._fallback_output_file_path(output_file_paths, self.plugin, output_file_map) + if fallback_path: + return f"{self.plugin}/{fallback_path}" + + return None + + def 
embed_path(self) -> str | None: + """ + Get the relative path to the embeddable output file for this result. + + This is intentionally DB-backed only so snapshot/admin rendering stays + fast and predictable without filesystem probes. + """ + return self.embed_path_db() + + @property + def output_dir_name(self) -> str: + return self.plugin + + @property + def output_dir_parent(self) -> str: + return str(Path(self.snapshot.output_dir).relative_to(CONSTANTS.DATA_DIR)) + + # Properties that delegate to Process model (for backwards compatibility) + # These properties will replace the direct fields after migration is complete + # They allow existing code to continue using archiveresult.pwd, .cmd, etc. + + # Note: After migration 3 creates Process records and migration 5 removes the old fields, + # these properties provide seamless access to Process data through ArchiveResult + + # Uncommented after migration 3 completed - properties now active + @property + def pwd(self) -> str: + """Working directory (from Process).""" + return self.process.pwd if self.process_id else "" + + @property + def cmd(self) -> list: + """Command array (from Process).""" + return self.process.cmd if self.process_id else [] + + @property + def cmd_version(self) -> str: + """Command version (from Process.binary).""" + return self.process.cmd_version if self.process_id else "" + + @property + def binary(self): + """Binary FK (from Process).""" + return self.process.binary if self.process_id else None + + @property + def iface(self): + """Network interface FK (from Process).""" + return self.process.iface if self.process_id else None + + @property + def machine(self): + """Machine FK (from Process).""" + return self.process.machine if self.process_id else None + + @property + def timeout(self) -> int: + """Timeout in seconds (from Process).""" + return self.process.timeout if self.process_id else 120 + + def save_search_index(self): + pass + + def update_from_output(self): + """ + Update this 
ArchiveResult from filesystem logs and output files. + + Used for Snapshot cleanup / orphan recovery when a hook's output exists + on disk but the projector did not finalize the row in the database. + + Updates: + - status, output_str, output_json from ArchiveResult JSONL record + - output_files, output_size, output_mimetypes by walking filesystem + - end_ts, cmd, cmd_version, binary FK + - Processes side-effect records (Snapshot, Tag, etc.) via process_hook_records() + """ + from collections import defaultdict + from pathlib import Path + from django.utils import timezone + from abx_dl.output_files import guess_mimetype + from archivebox.hooks import process_hook_records, extract_records_from_process + from archivebox.machine.models import Process + + plugin_dir = Path(self.pwd) if self.pwd else None + if not plugin_dir or not plugin_dir.exists(): + self.status = self.StatusChoices.FAILED + self.output_str = "Output directory not found" + self.end_ts = timezone.now() + self.save() + return + + # Read and parse JSONL output from stdout.log + stdout_file = plugin_dir / "stdout.log" + records = [] + if self.process_id and self.process: + records = extract_records_from_process(self.process) + + if not records: + stdout = stdout_file.read_text() if stdout_file.exists() else "" + records = Process.parse_records_from_text(stdout) + + # Find ArchiveResult record and update status/output from it + ar_records = [r for r in records if r.get("type") == "ArchiveResult"] + if ar_records: + hook_data = ar_records[0] + + # Update status + status_map = { + "succeeded": self.StatusChoices.SUCCEEDED, + "failed": self.StatusChoices.FAILED, + "skipped": self.StatusChoices.SKIPPED, + "noresults": self.StatusChoices.NORESULTS, + } + self.status = status_map.get(hook_data.get("status", "failed"), self.StatusChoices.FAILED) + + # Update output fields + self.output_str = hook_data.get("output_str") or hook_data.get("output") or "" + self.output_json = hook_data.get("output_json") + + # 
Update cmd fields + if hook_data.get("cmd"): + if self.process_id: + self.process.cmd = hook_data["cmd"] + self.process.save() + self._set_binary_from_cmd(hook_data["cmd"]) + # Note: cmd_version is derived from binary.version, not stored on Process + else: + # No ArchiveResult record: treat background hooks or clean exits as skipped + is_background = False + try: + from archivebox.hooks import is_background_hook + + is_background = bool(self.hook_name and is_background_hook(self.hook_name)) + except Exception: + pass + + if is_background or (self.process_id and self.process and self.process.exit_code == 0): + self.status = self.StatusChoices.SKIPPED + self.output_str = "Hook did not output ArchiveResult record" + else: + self.status = self.StatusChoices.FAILED + self.output_str = "Hook did not output ArchiveResult record" + + # Walk filesystem and populate output_files, output_size, output_mimetypes + exclude_names = {"stdout.log", "stderr.log", "process.pid", "hook.pid", "listener.pid", "cmd.sh"} + mime_sizes = defaultdict(int) + total_size = 0 + output_files = {} + + for file_path in plugin_dir.rglob("*"): + if not file_path.is_file(): + continue + if ".hooks" in file_path.parts: + continue + if file_path.name in exclude_names: + continue + + try: + stat = file_path.stat() + mime_type = guess_mimetype(file_path) or "application/octet-stream" + + relative_path = str(file_path.relative_to(plugin_dir)) + output_files[relative_path] = { + "extension": file_path.suffix.lower().lstrip("."), + "mimetype": mime_type, + "size": stat.st_size, + } + mime_sizes[mime_type] += stat.st_size + total_size += stat.st_size + except OSError: + continue + + self.output_files = output_files + self.output_size = total_size + sorted_mimes = sorted(mime_sizes.items(), key=lambda x: x[1], reverse=True) + self.output_mimetypes = ",".join(mime for mime, _ in sorted_mimes) + + # Update timestamps + self.end_ts = timezone.now() + + self.save() + + # Process side-effect records (filter 
Snapshots for depth/URL) + filtered_records = [] + for record in records: + record_type = record.get("type") + + # Skip ArchiveResult records (already processed above) + if record_type == "ArchiveResult": + continue + + # Filter Snapshot records for depth/URL constraints + if record_type == "Snapshot": + url = record.get("url") + if not url: + continue + + depth = record.get("depth", self.snapshot.depth + 1) + if depth > self.snapshot.crawl.max_depth: + continue + + if not self._url_passes_filters(url): + continue + + filtered_records.append(record) + + # Process filtered records with unified dispatcher + overrides = { + "snapshot": self.snapshot, + "crawl": self.snapshot.crawl, + "created_by_id": self.created_by.pk, + } + process_hook_records(filtered_records, overrides=overrides) + + # Cleanup PID files (keep logs even if empty so they can be tailed) + pid_file = plugin_dir / "hook.pid" + pid_file.unlink(missing_ok=True) + + def _set_binary_from_cmd(self, cmd: list) -> None: + """ + Find Binary for command and set binary FK. + + Tries matching by absolute path first, then by binary name. + Only matches binaries on the current machine. + """ + if not cmd: + return + + from archivebox.machine.models import Machine + + bin_path_or_name = cmd[0] if isinstance(cmd, list) else cmd + machine = Machine.current() + + # Try matching by absolute path first + binary = Binary.objects.filter( + abspath=bin_path_or_name, + machine=machine, + ).first() + + if binary: + if self.process_id: + self.process.binary = binary + self.process.save() + return + + # Fallback: match by binary name + bin_name = Path(bin_path_or_name).name + binary = Binary.objects.filter( + name=bin_name, + machine=machine, + ).first() + + if binary: + if self.process_id: + self.process.binary = binary + self.process.save() + + def _url_passes_filters(self, url: str) -> bool: + """Check if URL passes URL_ALLOWLIST and URL_DENYLIST config filters. 
+ + Uses proper config hierarchy: defaults -> file -> env -> machine -> user -> crawl -> snapshot + """ + return self.snapshot.crawl.url_passes_filters(url, snapshot=self.snapshot) + + @property + def output_dir(self) -> Path: + """Get the output directory for this plugin's results.""" + return Path(self.snapshot.output_dir) / self.plugin + + +# ============================================================================= +# State Machine Registration +# ============================================================================= + +# Manually register state machines with python-statemachine registry +# (normally auto-discovered from statemachines.py, but we define them here for clarity) +registry.register(SnapshotMachine) diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index 550c6077c5..d3f8ef026a 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -1,138 +1,216 @@ -__package__ = 'archivebox.core' +__package__ = "archivebox.core" import os import sys -import re -import logging -import tempfile +import inspect +import importlib +from typing import Any, cast from pathlib import Path + +from django.conf.locale.en import formats as en_formats # type: ignore from django.utils.crypto import get_random_string -from ..config import ( - DEBUG, - SECRET_KEY, - ALLOWED_HOSTS, - PACKAGE_DIR, - TEMPLATES_DIR_NAME, - CUSTOM_TEMPLATES_DIR, - SQL_INDEX_FILENAME, - OUTPUT_DIR, - LOGS_DIR, - TIME_ZONE, -) - -IS_MIGRATING = 'makemigrations' in sys.argv[:3] or 'migrate' in sys.argv[:3] -IS_TESTING = 'test' in sys.argv[:3] or 'PYTEST_CURRENT_TEST' in os.environ -IS_SHELL = 'shell' in sys.argv[:3] or 'shell_plus' in sys.argv[:3] +import archivebox + +from archivebox.config import DATA_DIR, PACKAGE_DIR, ARCHIVE_DIR, CONSTANTS # noqa +from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG, STORAGE_CONFIG # noqa +from archivebox.core.host_utils import normalize_base_url, get_admin_base_url, get_api_base_url +from 
.settings_logging import SETTINGS_LOGGING + + +IS_MIGRATING = "makemigrations" in sys.argv[:3] or "migrate" in sys.argv[:3] +IS_TESTING = "test" in sys.argv[:3] or "PYTEST_CURRENT_TEST" in os.environ +IS_SHELL = "shell" in sys.argv[:3] or "shell_plus" in sys.argv[:3] +IS_GETTING_VERSION_OR_HELP = "version" in sys.argv or "help" in sys.argv or "--version" in sys.argv or "--help" in sys.argv + +################################################################################ +### ArchiveBox Plugin Settings +################################################################################ + +ALL_PLUGINS = archivebox.ALL_PLUGINS +LOADED_PLUGINS = archivebox.LOADED_PLUGINS ################################################################################ ### Django Core Settings ################################################################################ -WSGI_APPLICATION = 'core.wsgi.application' -ROOT_URLCONF = 'core.urls' +WSGI_APPLICATION = "archivebox.core.wsgi.application" +ASGI_APPLICATION = "archivebox.core.asgi.application" +ROOT_URLCONF = "archivebox.core.urls" -LOGIN_URL = '/accounts/login/' -LOGOUT_REDIRECT_URL = '/' -PASSWORD_RESET_URL = '/accounts/password_reset/' -APPEND_SLASH = True +LOGIN_URL = "/accounts/login/" +LOGOUT_REDIRECT_URL = os.environ.get("LOGOUT_REDIRECT_URL", "/") -DEBUG = DEBUG or ('--debug' in sys.argv) +PASSWORD_RESET_URL = "/accounts/password_reset/" +APPEND_SLASH = True -INSTALLED_APPS = [ - 'django.contrib.auth', - 'django.contrib.contenttypes', - 'django.contrib.sessions', - 'django.contrib.messages', - 'django.contrib.staticfiles', - 'django.contrib.admin', +DEBUG = SHELL_CONFIG.DEBUG or ("--debug" in sys.argv) - 'core', - 'django_extensions', +INSTALLED_APPS = [ + "daphne", + # Django default apps + "django.contrib.auth", + "django.contrib.contenttypes", + "django.contrib.sessions", + "django.contrib.messages", + "django.contrib.staticfiles", + "django.contrib.admin", + # 3rd-party apps from PyPI + "signal_webhooks", # handles 
REST API outbound webhooks + "django_object_actions", # provides easy Django Admin action buttons on change views + # Our ArchiveBox-provided apps (use fully qualified names) + # NOTE: Order matters! Apps with migrations that depend on other apps must come AFTER their dependencies + # "archivebox.config", # ArchiveBox config settings (no models, not a real Django app) + "archivebox.machine", # handles collecting and storing information about the host machine, network interfaces, binaries, etc. + "archivebox.workers", # handles starting and managing background workers and processes (orchestrators and actors) + "archivebox.personas", # handles Persona and session management + "archivebox.core", # core django model with Snapshot, ArchiveResult, etc. (crawls depends on this) + "archivebox.crawls", # handles Crawl and CrawlSchedule models and management (depends on core) + "archivebox.api", # Django-Ninja-based Rest API interfaces, config, APIToken model, etc. + # ArchiveBox plugins (hook-based plugins no longer add Django apps) + # Use hooks.py discover_hooks() for plugin functionality + # 3rd-party apps from PyPI that need to be loaded last + "admin_data_views", # handles rendering some convenient automatic read-only views of data in Django admin + "django_extensions", # provides Django Debug Toolbar (and other non-debug helpers) ] MIDDLEWARE = [ - 'core.middleware.TimezoneMiddleware', - 'django.middleware.security.SecurityMiddleware', - 'django.contrib.sessions.middleware.SessionMiddleware', - 'django.middleware.common.CommonMiddleware', - 'django.middleware.csrf.CsrfViewMiddleware', - 'django.contrib.auth.middleware.AuthenticationMiddleware', - 'django.contrib.messages.middleware.MessageMiddleware', - 'core.middleware.CacheControlMiddleware', + "archivebox.core.middleware.TimezoneMiddleware", + "django.middleware.security.SecurityMiddleware", + "django.contrib.sessions.middleware.SessionMiddleware", + "django.middleware.common.CommonMiddleware", + 
"archivebox.api.middleware.ApiCorsMiddleware", + "django.middleware.csrf.CsrfViewMiddleware", + "django.contrib.auth.middleware.AuthenticationMiddleware", + "archivebox.core.middleware.ReverseProxyAuthMiddleware", + "archivebox.core.middleware.ServerSecurityModeMiddleware", + "archivebox.core.middleware.HostRoutingMiddleware", + "django.contrib.messages.middleware.MessageMiddleware", + "archivebox.core.middleware.CacheControlMiddleware", + # Additional middlewares from plugins (if any) ] + +################################################################################ +### Authentication Settings +################################################################################ + +# AUTH_USER_MODEL = 'auth.User' # cannot be easily changed unfortunately + AUTHENTICATION_BACKENDS = [ - 'django.contrib.auth.backends.ModelBackend', + "django.contrib.auth.backends.RemoteUserBackend", + "django.contrib.auth.backends.ModelBackend", + # Additional auth backends (e.g., LDAP) configured via settings ] -# only enable debug toolbar when in DEBUG mode with --nothreading (it doesnt work in multithreaded mode) -DEBUG_TOOLBAR = DEBUG and ('--nothreading' in sys.argv) and ('--reload' not in sys.argv) -if DEBUG_TOOLBAR: - try: - import debug_toolbar # noqa - DEBUG_TOOLBAR = True - except ImportError: - DEBUG_TOOLBAR = False -if DEBUG_TOOLBAR: - INSTALLED_APPS = [*INSTALLED_APPS, 'debug_toolbar'] - INTERNAL_IPS = ['0.0.0.0', '127.0.0.1', '*'] - DEBUG_TOOLBAR_CONFIG = { - "SHOW_TOOLBAR_CALLBACK": lambda request: True, - "RENDER_PANELS": True, - } - DEBUG_TOOLBAR_PANELS = [ - 'debug_toolbar.panels.history.HistoryPanel', - 'debug_toolbar.panels.versions.VersionsPanel', - 'debug_toolbar.panels.timer.TimerPanel', - 'debug_toolbar.panels.settings.SettingsPanel', - 'debug_toolbar.panels.headers.HeadersPanel', - 'debug_toolbar.panels.request.RequestPanel', - 'debug_toolbar.panels.sql.SQLPanel', - 'debug_toolbar.panels.staticfiles.StaticFilesPanel', - # 
'debug_toolbar.panels.templates.TemplatesPanel', - 'debug_toolbar.panels.cache.CachePanel', - 'debug_toolbar.panels.signals.SignalsPanel', - 'debug_toolbar.panels.logging.LoggingPanel', - 'debug_toolbar.panels.redirects.RedirectsPanel', - 'debug_toolbar.panels.profiling.ProfilingPanel', - 'djdt_flamegraph.FlamegraphPanel', - ] - MIDDLEWARE = [*MIDDLEWARE, 'debug_toolbar.middleware.DebugToolbarMiddleware'] +# LDAP Authentication Configuration +# Conditionally loaded if LDAP_ENABLED=True and django-auth-ldap is installed +try: + from archivebox.config.ldap import LDAP_CONFIG + + if LDAP_CONFIG.LDAP_ENABLED: + # Validate LDAP configuration + is_valid, error_msg = LDAP_CONFIG.validate_ldap_config() + if not is_valid: + from rich import print + + print(f"[red][X] Error: {error_msg}[/red]") + raise ValueError(error_msg) + + try: + # Try to import django-auth-ldap (will fail if not installed) + LDAPSearch = importlib.import_module("django_auth_ldap.config").LDAPSearch + ldap = importlib.import_module("ldap") + + # Configure LDAP authentication + AUTH_LDAP_SERVER_URI = LDAP_CONFIG.LDAP_SERVER_URI + AUTH_LDAP_BIND_DN = LDAP_CONFIG.LDAP_BIND_DN + AUTH_LDAP_BIND_PASSWORD = LDAP_CONFIG.LDAP_BIND_PASSWORD + + # Configure user search + AUTH_LDAP_USER_SEARCH = LDAPSearch( + LDAP_CONFIG.LDAP_USER_BASE, + getattr(ldap, "SCOPE_SUBTREE", 2), + LDAP_CONFIG.LDAP_USER_FILTER, + ) + + # Map LDAP attributes to Django user model fields + AUTH_LDAP_USER_ATTR_MAP = { + "username": LDAP_CONFIG.LDAP_USERNAME_ATTR, + "first_name": LDAP_CONFIG.LDAP_FIRSTNAME_ATTR, + "last_name": LDAP_CONFIG.LDAP_LASTNAME_ATTR, + "email": LDAP_CONFIG.LDAP_EMAIL_ATTR, + } + + # Use custom LDAP backend that supports LDAP_CREATE_SUPERUSER + AUTHENTICATION_BACKENDS = [ + "archivebox.ldap.auth.ArchiveBoxLDAPBackend", + "django.contrib.auth.backends.RemoteUserBackend", + "django.contrib.auth.backends.ModelBackend", + ] + + except ImportError as e: + from rich import print + + print("[red][X] Error: LDAP_ENABLED=True 
but required LDAP libraries are not installed![/red]") + print(f"[red] {e}[/red]") + print("[yellow] To install LDAP support, run:[/yellow]") + print("[yellow] pip install archivebox[ldap][/yellow]") + print("[yellow] Or manually:[/yellow]") + print("[yellow] apt install build-essential python3-dev libsasl2-dev libldap2-dev libssl-dev[/yellow]") + print("[yellow] pip install python-ldap django-auth-ldap[/yellow]") + raise + +except ImportError: + # archivebox.config.ldap not available (shouldn't happen but handle gracefully) + pass ################################################################################ ### Staticfile and Template Settings ################################################################################ -STATIC_URL = '/static/' - +STATIC_URL = "/static/" +TEMPLATES_DIR_NAME = "templates" +CUSTOM_TEMPLATES_ENABLED = os.path.isdir(STORAGE_CONFIG.CUSTOM_TEMPLATES_DIR) and os.access(STORAGE_CONFIG.CUSTOM_TEMPLATES_DIR, os.R_OK) STATICFILES_DIRS = [ - *([str(CUSTOM_TEMPLATES_DIR / 'static')] if CUSTOM_TEMPLATES_DIR else []), - str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / 'static'), + *([str(STORAGE_CONFIG.CUSTOM_TEMPLATES_DIR / "static")] if CUSTOM_TEMPLATES_ENABLED else []), + # *[ + # str(plugin_dir / 'static') + # for plugin_dir in PLUGIN_DIRS.values() + # if (plugin_dir / 'static').is_dir() + # ], + # Additional static file dirs from plugins + str(PACKAGE_DIR / TEMPLATES_DIR_NAME / "static"), ] TEMPLATE_DIRS = [ - *([str(CUSTOM_TEMPLATES_DIR)] if CUSTOM_TEMPLATES_DIR else []), - str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / 'core'), - str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / 'admin'), - str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME), + *([str(STORAGE_CONFIG.CUSTOM_TEMPLATES_DIR)] if CUSTOM_TEMPLATES_ENABLED else []), + # *[ + # str(plugin_dir / 'templates') + # for plugin_dir in PLUGIN_DIRS.values() + # if (plugin_dir / 'templates').is_dir() + # ], + # Additional template dirs from plugins + str(PACKAGE_DIR / TEMPLATES_DIR_NAME / "core"), + 
str(PACKAGE_DIR / TEMPLATES_DIR_NAME / "admin"), + str(PACKAGE_DIR / TEMPLATES_DIR_NAME), ] TEMPLATES = [ { - 'BACKEND': 'django.template.backends.django.DjangoTemplates', - 'DIRS': TEMPLATE_DIRS, - 'APP_DIRS': True, - 'OPTIONS': { - 'context_processors': [ - 'django.template.context_processors.debug', - 'django.template.context_processors.request', - 'django.contrib.auth.context_processors.auth', - 'django.contrib.messages.context_processors.messages', + "BACKEND": "django.template.backends.django.DjangoTemplates", + "DIRS": TEMPLATE_DIRS, + "APP_DIRS": True, + "OPTIONS": { + "context_processors": [ + "django.template.context_processors.debug", + "django.template.context_processors.request", + "django.contrib.auth.context_processors.auth", + "django.contrib.messages.context_processors.messages", ], }, }, @@ -143,92 +221,206 @@ ### External Service Settings ################################################################################ -DATABASE_FILE = Path(OUTPUT_DIR) / SQL_INDEX_FILENAME -DATABASE_NAME = os.environ.get("ARCHIVEBOX_DATABASE_NAME", str(DATABASE_FILE)) +# CACHE_DB_FILENAME = 'cache.sqlite3' +# CACHE_DB_PATH = CONSTANTS.CACHE_DIR / CACHE_DB_FILENAME +# CACHE_DB_TABLE = 'django_cache' + +DATABASE_NAME = os.environ.get("ARCHIVEBOX_DATABASE_NAME", str(CONSTANTS.DATABASE_FILE)) + +SQLITE_CONNECTION_OPTIONS = { + "ENGINE": "django.db.backends.sqlite3", + "TIME_ZONE": CONSTANTS.TIMEZONE, + "OPTIONS": { + # https://gcollazo.com/optimal-sqlite-settings-for-django/ + # https://litestream.io/tips/#busy-timeout + # https://docs.djangoproject.com/en/5.1/ref/databases/#setting-pragma-options + "timeout": 30, + "check_same_thread": False, + "transaction_mode": "IMMEDIATE", + "init_command": ( + "PRAGMA foreign_keys=ON;" + "PRAGMA busy_timeout = 30000;" + "PRAGMA journal_mode = WAL;" + "PRAGMA synchronous = NORMAL;" + "PRAGMA temp_store = MEMORY;" + "PRAGMA mmap_size = 134217728;" + "PRAGMA journal_size_limit = 67108864;" + "PRAGMA cache_size = 2000;" + ), + }, +} 
DATABASES = { - 'default': { - 'ENGINE': 'django.db.backends.sqlite3', - 'NAME': DATABASE_NAME, - 'OPTIONS': { - 'timeout': 60, - 'check_same_thread': False, - }, - 'TIME_ZONE': 'UTC', - # DB setup is sometimes modified at runtime by setup_django() in config.py - } + "default": { + "NAME": DATABASE_NAME, + **SQLITE_CONNECTION_OPTIONS, + }, + # "filestore": { + # "NAME": CONSTANTS.FILESTORE_DATABASE_FILE, + # **SQLITE_CONNECTION_OPTIONS, + # }, + # 'cache': { + # 'NAME': CACHE_DB_PATH, + # **SQLITE_CONNECTION_OPTIONS, + # }, } +MIGRATION_MODULES = {"signal_webhooks": None} + +# Django requires DEFAULT_AUTO_FIELD to subclass AutoField (BigAutoField, SmallAutoField, etc.) +# Cannot use UUIDField here until Django 6.0 introduces DEFAULT_PK_FIELD setting +# For now: manually add `id = models.UUIDField(primary_key=True, default=uuid7, ...)` to all models +# OR inherit from ModelWithUUID base class which provides UUID primary key +DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField" + + +# class FilestoreDBRouter: +# """ +# A router to store all the File models in the filestore.sqlite3 database. +# This data just mirrors what is in the file system, so we want to keep it in a separate database +# from the main index database to avoid contention. 
+# """ + +# route_app_labels = {"filestore"} +# db_name = "filestore" -CACHE_BACKEND = 'django.core.cache.backends.locmem.LocMemCache' -# CACHE_BACKEND = 'django.core.cache.backends.db.DatabaseCache' -# CACHE_BACKEND = 'django.core.cache.backends.dummy.DummyCache' +# def db_for_read(self, model, **hints): +# if model._meta.app_label in self.route_app_labels: +# return self.db_name +# return 'default' + +# def db_for_write(self, model, **hints): +# if model._meta.app_label in self.route_app_labels: +# return self.db_name +# return 'default' + +# def allow_relation(self, obj1, obj2, **hints): +# if obj1._meta.app_label in self.route_app_labels or obj2._meta.app_label in self.route_app_labels: +# return obj1._meta.app_label == obj2._meta.app_label +# return None + +# def allow_migrate(self, db, app_label, model_name=None, **hints): +# if app_label in self.route_app_labels: +# return db == self.db_name +# return db == "default" + +DATABASE_ROUTERS = [] CACHES = { - 'default': { - 'BACKEND': CACHE_BACKEND, - 'LOCATION': 'django_cache_default', - } + "default": {"BACKEND": "django.core.cache.backends.locmem.LocMemCache"}, + # 'sqlite': {'BACKEND': 'django.core.cache.backends.db.DatabaseCache', 'LOCATION': 'cache'}, + # 'dummy': {'BACKEND': 'django.core.cache.backends.dummy.DummyCache'}, + # 'filebased': {"BACKEND": "django.core.cache.backends.filebased.FileBasedCache", "LOCATION": CACHE_DIR / 'cache_filebased'}, } -EMAIL_BACKEND = 'django.core.mail.backends.console.EmailBackend' +EMAIL_BACKEND = "django.core.mail.backends.console.EmailBackend" +STORAGES = { + "default": { + "BACKEND": "django.core.files.storage.FileSystemStorage", + }, + "staticfiles": { + "BACKEND": "django.contrib.staticfiles.storage.StaticFilesStorage", + }, + "archive": { + "BACKEND": "django.core.files.storage.FileSystemStorage", + "OPTIONS": { + "base_url": "/archive/", + "location": ARCHIVE_DIR, + }, + }, + # "snapshots": { + # "BACKEND": "django.core.files.storage.FileSystemStorage", + # 
"OPTIONS": { + # "base_url": "/snapshots/", + # "location": CONSTANTS.SNAPSHOTS_DIR, + # }, + # }, + # "personas": { + # "BACKEND": "django.core.files.storage.FileSystemStorage", + # "OPTIONS": { + # "base_url": "/personas/", + # "location": PERSONAS_DIR, + # }, + # }, +} + +CHANNEL_LAYERS = {"default": {"BACKEND": "channels.layers.InMemoryChannelLayer"}} + ################################################################################ ### Security Settings ################################################################################ -SECRET_KEY = SECRET_KEY or get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789_') +SECRET_KEY = SERVER_CONFIG.SECRET_KEY or get_random_string(50, "abcdefghijklmnopqrstuvwxyz0123456789_") + +ALLOWED_HOSTS = SERVER_CONFIG.ALLOWED_HOSTS.split(",") +CSRF_TRUSTED_ORIGINS = list(set(SERVER_CONFIG.CSRF_TRUSTED_ORIGINS.split(","))) -ALLOWED_HOSTS = ALLOWED_HOSTS.split(',') +admin_base_url = normalize_base_url(get_admin_base_url()) +if admin_base_url and admin_base_url not in CSRF_TRUSTED_ORIGINS: + CSRF_TRUSTED_ORIGINS.append(admin_base_url) + +api_base_url = normalize_base_url(get_api_base_url()) +if api_base_url and api_base_url not in CSRF_TRUSTED_ORIGINS: + CSRF_TRUSTED_ORIGINS.append(api_base_url) + +# automatically fix case when user sets ALLOWED_HOSTS (e.g. to archivebox.example.com) +# but forgets to add https://archivebox.example.com to CSRF_TRUSTED_ORIGINS +for hostname in ALLOWED_HOSTS: + https_endpoint = f"https://{hostname}" + if hostname != "*" and https_endpoint not in CSRF_TRUSTED_ORIGINS: + print(f"[!] 
WARNING: {https_endpoint} from ALLOWED_HOSTS should be added to CSRF_TRUSTED_ORIGINS") + CSRF_TRUSTED_ORIGINS.append(https_endpoint) SECURE_BROWSER_XSS_FILTER = True SECURE_CONTENT_TYPE_NOSNIFF = True -SECURE_REFERRER_POLICY = 'strict-origin-when-cross-origin' +SECURE_REFERRER_POLICY = "strict-origin-when-cross-origin" CSRF_COOKIE_SECURE = False SESSION_COOKIE_SECURE = False +SESSION_COOKIE_HTTPONLY = True SESSION_COOKIE_DOMAIN = None +CSRF_COOKIE_DOMAIN = None SESSION_COOKIE_AGE = 1209600 # 2 weeks SESSION_EXPIRE_AT_BROWSER_CLOSE = False -SESSION_SAVE_EVERY_REQUEST = True +SESSION_SAVE_EVERY_REQUEST = False SESSION_ENGINE = "django.contrib.sessions.backends.db" AUTH_PASSWORD_VALIDATORS = [ - {'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator'}, - {'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator'}, - {'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator'}, - {'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator'}, + {"NAME": "django.contrib.auth.password_validation.UserAttributeSimilarityValidator"}, + {"NAME": "django.contrib.auth.password_validation.MinimumLengthValidator"}, + {"NAME": "django.contrib.auth.password_validation.CommonPasswordValidator"}, + {"NAME": "django.contrib.auth.password_validation.NumericPasswordValidator"}, ] +DATA_UPLOAD_MAX_NUMBER_FIELDS = None +DATA_UPLOAD_MAX_MEMORY_SIZE = 26_214_400 # 25MB ################################################################################ ### Shell Settings ################################################################################ -SHELL_PLUS = 'ipython' +SHELL_PLUS = "ipython" SHELL_PLUS_PRINT_SQL = False -IPYTHON_ARGUMENTS = ['--no-confirm-exit', '--no-banner'] -IPYTHON_KERNEL_DISPLAY_NAME = 'ArchiveBox Django Shell' +IPYTHON_ARGUMENTS = ["--no-confirm-exit", "--no-banner"] +IPYTHON_KERNEL_DISPLAY_NAME = "ArchiveBox Django Shell" if IS_SHELL: - os.environ['PYTHONSTARTUP'] = str(Path(PACKAGE_DIR) / 
'core' / 'welcome_message.py') + os.environ["PYTHONSTARTUP"] = str(PACKAGE_DIR / "misc" / "shell_welcome_message.py") ################################################################################ ### Internationalization & Localization Settings ################################################################################ -LANGUAGE_CODE = 'en-us' +LANGUAGE_CODE = "en-us" USE_I18N = True -USE_L10N = True USE_TZ = True -DATETIME_FORMAT = 'Y-m-d g:iA' -SHORT_DATETIME_FORMAT = 'Y-m-d h:iA' -TIME_ZONE = TIME_ZONE # noqa +DATETIME_FORMAT = "Y-m-d h:i:s A" +SHORT_DATETIME_FORMAT = "Y-m-d h:i:s A" +TIME_ZONE = CONSTANTS.TIMEZONE # django convention is TIME_ZONE, archivebox config uses TIMEZONE, they are equivalent -from django.conf.locale.en import formats as en_formats - -en_formats.DATETIME_FORMAT = DATETIME_FORMAT +en_formats.DATETIME_FORMAT = DATETIME_FORMAT # monkey patch en_format default with our preferred format en_formats.SHORT_DATETIME_FORMAT = SHORT_DATETIME_FORMAT @@ -236,67 +428,184 @@ ### Logging Settings ################################################################################ -IGNORABLE_404_URLS = [ - re.compile(r'apple-touch-icon.*\.png$'), - re.compile(r'favicon\.ico$'), - re.compile(r'robots\.txt$'), - re.compile(r'.*\.(css|js)\.map$'), -] +LOGGING = SETTINGS_LOGGING + + +################################################################################ +### REST API Outbound Webhooks settings +################################################################################ + +# Add default webhook configuration to the User model +SIGNAL_WEBHOOKS_CUSTOM_MODEL = "archivebox.api.models.OutboundWebhook" +SIGNAL_WEBHOOKS: dict[str, object] = { + "HOOKS": { + # ... 
is a special sigil value that means "use the default autogenerated hooks" + "django.contrib.auth.models.User": ..., + "archivebox.core.models.Snapshot": ..., + "archivebox.core.models.ArchiveResult": ..., + "archivebox.core.models.Tag": ..., + "archivebox.api.models.APIToken": ..., + }, +} -class NoisyRequestsFilter(logging.Filter): - def filter(self, record): - logline = record.getMessage() - - # ignore harmless 404s for the patterns in IGNORABLE_404_URLS - for ignorable_url_pattern in IGNORABLE_404_URLS: - ignorable_log_pattern = re.compile(f'^"GET /.*/?{ignorable_url_pattern.pattern[:-1]} HTTP/.*" (200|30.|404) .+$', re.I | re.M) - if ignorable_log_pattern.match(logline): - return 0 - - # ignore staticfile requests that 200 or 30* - ignoreable_200_log_pattern = re.compile(r'"GET /static/.* HTTP/.*" (200|30.) .+', re.I | re.M) - if ignoreable_200_log_pattern.match(logline): - return 0 - - return 1 - -if LOGS_DIR.exists(): - ERROR_LOG = (LOGS_DIR / 'errors.log') -else: - # meh too many edge cases here around creating log dir w/ correct permissions - # cant be bothered, just trash the log and let them figure it out via stdout/stderr - ERROR_LOG = tempfile.NamedTemporaryFile().name - -LOGGING = { - 'version': 1, - 'disable_existing_loggers': False, - 'handlers': { - 'console': { - 'class': 'logging.StreamHandler', +# Avoid background threads touching sqlite connections (especially during tests/migrations). 
+default_database = cast(dict[str, Any], DATABASES["default"]) +if str(default_database["ENGINE"]).endswith("sqlite3"): + SIGNAL_WEBHOOKS["TASK_HANDLER"] = "signal_webhooks.handlers.sync_task_handler" + +################################################################################ +### Admin Data View Settings +################################################################################ + +ADMIN_DATA_VIEWS = { + "NAME": "Environment", + "URLS": [ + { + "route": "config/", + "view": "archivebox.core.views.live_config_list_view", + "name": "Configuration", + "items": { + "route": "/", + "view": "archivebox.core.views.live_config_value_view", + "name": "config_val", + }, }, - 'logfile': { - 'level': 'ERROR', - 'class': 'logging.handlers.RotatingFileHandler', - 'filename': ERROR_LOG, - 'maxBytes': 1024 * 1024 * 25, # 25 MB - 'backupCount': 10, + { + "route": "binaries/", + "view": "archivebox.config.views.binaries_list_view", + "name": "Dependencies", + "items": { + "route": "/", + "view": "archivebox.config.views.binary_detail_view", + "name": "binary", + }, }, - }, - 'filters': { - 'noisyrequestsfilter': { - '()': NoisyRequestsFilter, - } - }, - 'loggers': { - 'django': { - 'handlers': ['console', 'logfile'], - 'level': 'INFO', - 'filters': ['noisyrequestsfilter'], + { + "route": "plugins/", + "view": "archivebox.config.views.plugins_list_view", + "name": "Plugins", + "items": { + "route": "/", + "view": "archivebox.config.views.plugin_detail_view", + "name": "plugin", + }, }, - 'django.server': { - 'handlers': ['console', 'logfile'], - 'level': 'INFO', - 'filters': ['noisyrequestsfilter'], - } - }, + { + "route": "workers/", + "view": "archivebox.config.views.worker_list_view", + "name": "Workers", + "items": { + "route": "/", + "view": "archivebox.config.views.worker_detail_view", + "name": "worker", + }, + }, + { + "route": "logs/", + "view": "archivebox.config.views.log_list_view", + "name": "Logs", + "items": { + "route": "/", + "view": 
"archivebox.config.views.log_detail_view", + "name": "log", + }, + }, + # Additional admin data views from plugins + ], } + + +################################################################################ +### Debug Settings +################################################################################ + +# only enable debug toolbar when in DEBUG mode with --nothreading (it doesnt work in multithreaded mode) +DEBUG_TOOLBAR = False +DEBUG_TOOLBAR = DEBUG_TOOLBAR and DEBUG and ("--nothreading" in sys.argv) and ("--reload" not in sys.argv) +if DEBUG_TOOLBAR: + try: + import debug_toolbar # noqa + + DEBUG_TOOLBAR = True + except ImportError: + DEBUG_TOOLBAR = False + +if DEBUG_TOOLBAR: + INSTALLED_APPS = [*INSTALLED_APPS, "debug_toolbar"] + INTERNAL_IPS = ["0.0.0.0", "127.0.0.1", "*"] + DEBUG_TOOLBAR_CONFIG = { + "SHOW_TOOLBAR_CALLBACK": lambda request: True, + "RENDER_PANELS": True, + } + DEBUG_TOOLBAR_PANELS = [ + "debug_toolbar.panels.history.HistoryPanel", + "debug_toolbar.panels.versions.VersionsPanel", + "debug_toolbar.panels.timer.TimerPanel", + "debug_toolbar.panels.settings.SettingsPanel", + "debug_toolbar.panels.headers.HeadersPanel", + "debug_toolbar.panels.request.RequestPanel", + "debug_toolbar.panels.sql.SQLPanel", + "debug_toolbar.panels.staticfiles.StaticFilesPanel", + # 'debug_toolbar.panels.templates.TemplatesPanel', + "debug_toolbar.panels.cache.CachePanel", + "debug_toolbar.panels.signals.SignalsPanel", + "debug_toolbar.panels.logging.LoggingPanel", + "debug_toolbar.panels.redirects.RedirectsPanel", + "debug_toolbar.panels.profiling.ProfilingPanel", + "djdt_flamegraph.FlamegraphPanel", + ] + MIDDLEWARE = [*MIDDLEWARE, "debug_toolbar.middleware.DebugToolbarMiddleware"] + +if DEBUG: + try: + import django_autotyping # noqa + except ImportError: + pass + else: + INSTALLED_APPS += ["django_autotyping"] + AUTOTYPING = { + "STUBS_GENERATION": { + "LOCAL_STUBS_DIR": PACKAGE_DIR / "typings", + }, + } + +# 
https://github.com/bensi94/Django-Requests-Tracker (improved version of django-debug-toolbar) +# Must delete archivebox/templates/admin to use because it relies on some things we override +# visit /__requests_tracker__/ to access +DEBUG_REQUESTS_TRACKER = True +DEBUG_REQUESTS_TRACKER = DEBUG_REQUESTS_TRACKER and DEBUG +if DEBUG_REQUESTS_TRACKER: + import requests_tracker + + INSTALLED_APPS += ["requests_tracker"] + MIDDLEWARE += ["requests_tracker.middleware.requests_tracker_middleware"] + INTERNAL_IPS = ["127.0.0.1", "10.0.2.2", "0.0.0.0", "*"] + + TEMPLATE_DIRS.insert(0, str(Path(inspect.getfile(requests_tracker)).parent / "templates")) + + REQUESTS_TRACKER_CONFIG = { + "TRACK_SQL": True, + "ENABLE_STACKTRACES": False, + "IGNORE_PATHS_PATTERNS": ( + r".*/favicon\.ico", + r".*\.png", + r"/admin/jsi18n/", + ), + "IGNORE_SQL_PATTERNS": ( + r"^SELECT .* FROM django_migrations WHERE app = 'requests_tracker'", + r"^SELECT .* FROM django_migrations WHERE app = 'auth'", + ), + } + +# # https://docs.pydantic.dev/logfire/integrations/django/ (similar to DataDog / NewRelic / etc.) 
+# DEBUG_LOGFIRE = False +# DEBUG_LOGFIRE = DEBUG_LOGFIRE and os.access(DATA_DIR / '.logfire', os.W_OK) and (DATA_DIR / '.logfire').is_dir() + + +# For usage with https://www.jetadmin.io/integrations/django +# INSTALLED_APPS += ['jet_django'] +# JET_PROJECT = 'archivebox' +# JET_TOKEN = 'some-api-token-here' + + +# import ipdb; ipdb.set_trace() diff --git a/archivebox/core/settings_logging.py b/archivebox/core/settings_logging.py new file mode 100644 index 0000000000..0816fde4f2 --- /dev/null +++ b/archivebox/core/settings_logging.py @@ -0,0 +1,209 @@ +__package__ = "archivebox.core" + +import re +import os +import tempfile +import logging + + +from archivebox.config import CONSTANTS + + +IGNORABLE_URL_PATTERNS = [ + re.compile(r"/.*/?apple-touch-icon.*\.png"), + re.compile(r"/.*/?favicon\.ico"), + re.compile(r"/.*/?robots\.txt"), + re.compile(r"/.*/?.*\.(css|js)\.map"), + re.compile(r"/.*/?.*\.(css|js)\.map"), + re.compile(r"/static/.*"), + re.compile(r"/admin/jsi18n/"), +] + + +class NoisyRequestsFilter(logging.Filter): + def filter(self, record) -> bool: + logline = record.getMessage() + # '"GET /api/v1/docs HTTP/1.1" 200 1023' + # '"GET /static/admin/js/SelectFilter2.js HTTP/1.1" 200 15502' + # '"GET /static/admin/js/SelectBox.js HTTP/1.1" 304 0' + # '"GET /admin/jsi18n/ HTTP/1.1" 200 3352' + # '"GET /admin/api/apitoken/0191bbf8-fd5e-0b8c-83a8-0f32f048a0af/change/ HTTP/1.1" 200 28778' + + # ignore harmless 404s for the patterns in IGNORABLE_URL_PATTERNS + for pattern in IGNORABLE_URL_PATTERNS: + ignorable_GET_request = re.compile(f'"GET {pattern.pattern} HTTP/.*" (2..|30.|404) .+$', re.I | re.M) + if ignorable_GET_request.match(logline): + return False + + ignorable_404_pattern = re.compile(f"Not Found: {pattern.pattern}", re.I | re.M) + if ignorable_404_pattern.match(logline): + return False + + return True + + +class CustomOutboundWebhookLogFormatter(logging.Formatter): + def format(self, record): + result = super().format(record) + return 
result.replace("HTTP Request: ", "OutboundWebhook: ") + + +class StripANSIColorCodesFilter(logging.Filter): + _ansi_re = re.compile(r"\x1b\[[0-9;]*m") + _bare_re = re.compile(r"\[[0-9;]*m") + + def filter(self, record) -> bool: + msg = record.getMessage() + if isinstance(msg, str) and ("\x1b[" in msg or "[m" in msg): + msg = self._ansi_re.sub("", msg) + msg = self._bare_re.sub("", msg) + record.msg = msg + record.args = () + return True + + +ERROR_LOG = tempfile.NamedTemporaryFile().name + +LOGS_DIR = CONSTANTS.LOGS_DIR + +if os.access(LOGS_DIR, os.W_OK) and LOGS_DIR.is_dir(): + ERROR_LOG = LOGS_DIR / "errors.log" +else: + # historically too many edge cases here around creating log dir w/ correct permissions early on + # if there's an issue on startup, we trash the log and let user figure it out via stdout/stderr + # print(f'[!] WARNING: data/logs dir does not exist. Logging to temp file: {ERROR_LOG}') + pass + +LOG_LEVEL_DATABASE = "WARNING" # change to DEBUG to log all SQL queries +LOG_LEVEL_REQUEST = "WARNING" # if DEBUG else 'WARNING' + +if LOG_LEVEL_DATABASE == "DEBUG": + db_logger = logging.getLogger("django.db.backends") + db_logger.setLevel(logging.DEBUG) + db_logger.addHandler(logging.StreamHandler()) + + +SETTINGS_LOGGING = { + "version": 1, + "disable_existing_loggers": False, + "formatters": { + "rich": { + "datefmt": "[%Y-%m-%d %H:%M:%S]", + "format": "%(name)s %(message)s", + }, + "outbound_webhooks": { + "()": CustomOutboundWebhookLogFormatter, + "datefmt": "[%Y-%m-%d %H:%M:%S]", + }, + }, + "filters": { + "noisyrequestsfilter": { + "()": NoisyRequestsFilter, + }, + "stripansi": { + "()": StripANSIColorCodesFilter, + }, + "require_debug_false": { + "()": "django.utils.log.RequireDebugFalse", + }, + "require_debug_true": { + "()": "django.utils.log.RequireDebugTrue", + }, + }, + "handlers": { + "default": { + "class": "rich.logging.RichHandler", + "formatter": "rich", + "level": "DEBUG", + "markup": False, + "rich_tracebacks": False, # Use standard 
Python tracebacks (no frame/box) + "filters": ["noisyrequestsfilter", "stripansi"], + }, + "logfile": { + "level": "INFO", + "class": "logging.handlers.RotatingFileHandler", + "filename": ERROR_LOG, + "maxBytes": 1024 * 1024 * 25, # 25 MB + "backupCount": 10, + "formatter": "rich", + "filters": ["noisyrequestsfilter", "stripansi"], + }, + "outbound_webhooks": { + "class": "rich.logging.RichHandler", + "markup": False, + "rich_tracebacks": False, # Use standard Python tracebacks (no frame/box) + "formatter": "outbound_webhooks", + }, + # "mail_admins": { + # "level": "ERROR", + # "filters": ["require_debug_false"], + # "class": "django.utils.log.AdminEmailHandler", + # }, + "null": { + "class": "logging.NullHandler", + }, + }, + "root": { + "handlers": ["default", "logfile"], + "level": "INFO", + "formatter": "rich", + }, + "loggers": { + "api": { + "handlers": ["default", "logfile"], + "level": "DEBUG", + "propagate": False, + }, + "checks": { + "handlers": ["default", "logfile"], + "level": "DEBUG", + "propagate": False, + }, + "core": { + "handlers": ["default", "logfile"], + "level": "DEBUG", + "propagate": False, + }, + "httpx": { + "handlers": ["outbound_webhooks"], + "level": "INFO", + "formatter": "outbound_webhooks", + "propagate": False, + }, + "django": { + "handlers": ["default", "logfile"], + "level": "INFO", + "filters": ["noisyrequestsfilter"], + "propagate": False, + }, + "django.utils.autoreload": { + "propagate": False, + "handlers": [], + "level": "ERROR", + }, + "django.channels.server": { + # see archivebox.misc.monkey_patches.ModifiedAccessLogGenerator for dedicated daphne server logging settings + "propagate": False, + "handlers": ["default", "logfile"], + "level": "INFO", + "filters": ["noisyrequestsfilter"], + }, + "django.server": { # logs all requests (2xx, 3xx, 4xx) + "propagate": False, + "handlers": ["default", "logfile"], + "level": "INFO", + "filters": ["noisyrequestsfilter"], + }, + "django.request": { # only logs 4xx and 5xx errors 
+ "propagate": False, + "handlers": ["default", "logfile"], + "level": "ERROR", + "filters": ["noisyrequestsfilter"], + }, + "django.db.backends": { + "propagate": False, + "handlers": ["default"], + "level": LOG_LEVEL_DATABASE, + }, + }, +} diff --git a/archivebox/core/tag_utils.py b/archivebox/core/tag_utils.py new file mode 100644 index 0000000000..d0efd427d1 --- /dev/null +++ b/archivebox/core/tag_utils.py @@ -0,0 +1,270 @@ +from __future__ import annotations + +import json +from collections import defaultdict +from typing import Any + +from django.contrib.auth.models import User +from django.db.models import Count, F, Q, QuerySet +from django.db.models.functions import Lower +from django.http import HttpRequest +from django.urls import reverse + +from archivebox.core.host_utils import build_snapshot_url, build_web_url +from archivebox.core.models import Snapshot, SnapshotTag, Tag + + +TAG_SNAPSHOT_PREVIEW_LIMIT = 10 +TAG_SORT_CHOICES = ( + ("name_asc", "Name A-Z"), + ("name_desc", "Name Z-A"), + ("created_desc", "Created newest"), + ("created_asc", "Created oldest"), + ("snapshots_desc", "Most snapshots"), + ("snapshots_asc", "Fewest snapshots"), +) +TAG_HAS_SNAPSHOTS_CHOICES = ( + ("all", "All"), + ("yes", "Has snapshots"), + ("no", "No snapshots"), +) + + +def normalize_tag_name(name: str) -> str: + return (name or "").strip() + + +def normalize_tag_sort(sort: str = "created_desc") -> str: + valid_sorts = {key for key, _label in TAG_SORT_CHOICES} + return sort if sort in valid_sorts else "created_desc" + + +def normalize_has_snapshots_filter(value: str = "all") -> str: + valid_filters = {key for key, _label in TAG_HAS_SNAPSHOTS_CHOICES} + return value if value in valid_filters else "all" + + +def normalize_created_by_filter(created_by: str = "") -> str: + return created_by if str(created_by).isdigit() else "" + + +def normalize_created_year_filter(year: str = "") -> str: + year = (year or "").strip() + return year if len(year) == 4 and year.isdigit() else "" 
+ + +def get_matching_tags( + query: str = "", + sort: str = "created_desc", + created_by: str = "", + year: str = "", + has_snapshots: str = "all", +) -> QuerySet[Tag]: + queryset = Tag.objects.select_related("created_by").annotate( + num_snapshots=Count("snapshot_set", distinct=True), + ) + + query = normalize_tag_name(query) + if query: + queryset = queryset.filter( + Q(name__icontains=query) | Q(slug__icontains=query), + ) + + created_by = normalize_created_by_filter(created_by) + if created_by: + queryset = queryset.filter(created_by_id=int(created_by)) + + year = normalize_created_year_filter(year) + if year: + queryset = queryset.filter(created_at__year=int(year)) + + has_snapshots = normalize_has_snapshots_filter(has_snapshots) + if has_snapshots == "yes": + queryset = queryset.filter(num_snapshots__gt=0) + elif has_snapshots == "no": + queryset = queryset.filter(num_snapshots=0) + + sort = normalize_tag_sort(sort) + if sort == "name_asc": + queryset = queryset.order_by(Lower("name"), "id") + elif sort == "name_desc": + queryset = queryset.order_by(Lower("name").desc(), "-id") + elif sort == "created_asc": + queryset = queryset.order_by(F("created_at").asc(nulls_first=True), "id", Lower("name")) + elif sort == "snapshots_desc": + queryset = queryset.order_by(F("num_snapshots").desc(nulls_last=True), F("created_at").desc(nulls_last=True), "-id", Lower("name")) + elif sort == "snapshots_asc": + queryset = queryset.order_by(F("num_snapshots").asc(nulls_first=True), Lower("name"), "id") + else: + queryset = queryset.order_by(F("created_at").desc(nulls_last=True), "-id", Lower("name")) + + return queryset + + +def get_tag_creator_choices() -> list[tuple[str, str]]: + rows = ( + Tag.objects.filter(created_by__isnull=False) + .values_list("created_by_id", "created_by__username") + .order_by(Lower("created_by__username"), "created_by_id") + .distinct() + ) + return [(str(user_id), username or f"User {user_id}") for user_id, username in rows] + + +def 
get_tag_year_choices() -> list[str]: + years = Tag.objects.exclude(created_at__isnull=True).dates("created_at", "year", order="DESC") + return [str(year.year) for year in years] + + +def get_tag_by_ref(tag_ref: str | int) -> Tag: + if isinstance(tag_ref, int): + return Tag.objects.get(pk=tag_ref) + + ref = str(tag_ref).strip() + if ref.isdigit(): + return Tag.objects.get(pk=int(ref)) + + try: + return Tag.objects.get(slug__iexact=ref) + except Tag.DoesNotExist: + return Tag.objects.get(slug__icontains=ref) + + +def get_or_create_tag(name: str, created_by: User | None = None) -> tuple[Tag, bool]: + normalized_name = normalize_tag_name(name) + if not normalized_name: + raise ValueError("Tag name is required") + + existing = Tag.objects.filter(name__iexact=normalized_name).first() + if existing: + return existing, False + + tag = Tag.objects.create( + name=normalized_name, + created_by=created_by, + ) + return tag, True + + +def rename_tag(tag: Tag, name: str) -> Tag: + normalized_name = normalize_tag_name(name) + if not normalized_name: + raise ValueError("Tag name is required") + + existing = Tag.objects.filter(name__iexact=normalized_name).exclude(pk=tag.pk).first() + if existing: + raise ValueError(f'Tag "{existing.name}" already exists') + + if tag.name != normalized_name: + tag.name = normalized_name + tag.save() + return tag + + +def delete_tag(tag: Tag) -> tuple[int, dict[str, int]]: + return tag.delete() + + +def export_tag_urls(tag: Tag) -> str: + urls = tag.snapshot_set.order_by("-downloaded_at", "-created_at", "-pk").values_list("url", flat=True) + return "\n".join(urls) + + +def export_tag_snapshots_jsonl(tag: Tag) -> str: + snapshots = tag.snapshot_set.order_by("-downloaded_at", "-created_at", "-pk").prefetch_related("tags") + return "\n".join(json.dumps(snapshot.to_json()) for snapshot in snapshots) + + +def _display_snapshot_title(snapshot: Snapshot) -> str: + title = (snapshot.title or "").strip() + url = (snapshot.url or "").strip() + if not title: + 
return url + + normalized_title = title.lower() + if normalized_title == "pending..." or normalized_title == url.lower(): + return url + return title + + +def _build_snapshot_preview(snapshot: Snapshot, request: HttpRequest | None = None) -> dict[str, Any]: + return { + "id": str(snapshot.pk), + "title": _display_snapshot_title(snapshot), + "url": snapshot.url, + "favicon_url": build_snapshot_url(str(snapshot.pk), "favicon.ico", request=request), + "admin_url": reverse("admin:core_snapshot_change", args=[snapshot.pk]), + "archive_url": build_web_url(f"/{snapshot.archive_path_from_db}/index.html", request=request), + "downloaded_at": snapshot.downloaded_at.isoformat() if snapshot.downloaded_at else None, + } + + +def _build_snapshot_preview_map( + tags: list[Tag], + request: HttpRequest | None = None, + preview_limit: int = TAG_SNAPSHOT_PREVIEW_LIMIT, +) -> dict[int, list[dict[str, Any]]]: + tag_ids = [tag.pk for tag in tags] + if not tag_ids: + return {} + + snapshot_tags = ( + SnapshotTag.objects.filter(tag_id__in=tag_ids) + .select_related("snapshot__crawl__created_by") + .order_by( + "tag_id", + F("snapshot__downloaded_at").desc(nulls_last=True), + F("snapshot__created_at").desc(nulls_last=True), + F("snapshot_id").desc(), + ) + ) + + preview_map: dict[int, list[dict[str, Any]]] = defaultdict(list) + for snapshot_tag in snapshot_tags: + previews = preview_map[snapshot_tag.tag_id] + if len(previews) >= preview_limit: + continue + previews.append(_build_snapshot_preview(snapshot_tag.snapshot, request=request)) + return preview_map + + +def build_tag_card(tag: Tag, snapshot_previews: list[dict[str, Any]] | None = None) -> dict[str, Any]: + count = getattr(tag, "num_snapshots", tag.snapshot_set.count()) + return { + "id": tag.pk, + "name": tag.name, + "slug": tag.slug, + "num_snapshots": count, + "filter_url": f"{reverse('admin:core_snapshot_changelist')}?tags__id__exact={tag.pk}", + "edit_url": reverse("admin:core_tag_change", args=[tag.pk]), + "export_urls_url": 
reverse("api-1:tag_urls_export", args=[tag.pk]), + "export_jsonl_url": reverse("api-1:tag_snapshots_export", args=[tag.pk]), + "rename_url": reverse("api-1:rename_tag", args=[tag.pk]), + "delete_url": reverse("api-1:delete_tag", args=[tag.pk]), + "snapshots": snapshot_previews or [], + } + + +def build_tag_cards( + query: str = "", + request: HttpRequest | None = None, + limit: int | None = None, + preview_limit: int = TAG_SNAPSHOT_PREVIEW_LIMIT, + sort: str = "created_desc", + created_by: str = "", + year: str = "", + has_snapshots: str = "all", +) -> list[dict[str, Any]]: + queryset = get_matching_tags( + query=query, + sort=sort, + created_by=created_by, + year=year, + has_snapshots=has_snapshots, + ) + if limit is not None: + queryset = queryset[:limit] + + tags = list(queryset) + preview_map = _build_snapshot_preview_map(tags, request=request, preview_limit=preview_limit) + return [build_tag_card(tag, snapshot_previews=preview_map.get(tag.pk, [])) for tag in tags] diff --git a/archivebox/core/templatetags/config_tags.py b/archivebox/core/templatetags/config_tags.py new file mode 100644 index 0000000000..35e3d45c61 --- /dev/null +++ b/archivebox/core/templatetags/config_tags.py @@ -0,0 +1,22 @@ +"""Template tags for accessing config values in templates.""" + +from typing import Any + +from django import template + +from archivebox.config.configset import get_config as _get_config + +register = template.Library() + + +@register.simple_tag +def get_config(key: str) -> Any: + """ + Get a config value by key. 
+ + Usage: {% get_config "ARCHIVEDOTORG_ENABLED" as enabled %} + """ + try: + return _get_config().get(key) + except (KeyError, AttributeError): + return None diff --git a/archivebox/core/templatetags/core_tags.py b/archivebox/core/templatetags/core_tags.py index 4f53ac2a78..fb1730a2f2 100644 --- a/archivebox/core/templatetags/core_tags.py +++ b/archivebox/core/templatetags/core_tags.py @@ -1,23 +1,281 @@ +from typing import Any + from django import template from django.contrib.admin.templatetags.base import InclusionAdminNode +from django.utils.safestring import mark_safe +from django.utils.html import escape +from pathlib import Path -from typing import Union +from archivebox.hooks import ( + get_plugin_icon, + get_plugin_template, + get_plugin_name, +) +from archivebox.core.host_utils import ( + get_admin_base_url, + get_public_base_url, + get_web_base_url, + get_snapshot_base_url, + build_snapshot_url, +) register = template.Library() -@register.filter(name='split') -def split(value, separator: str=','): - return (value or '').split(separator) +_TEXT_PREVIEW_EXTS = (".json", ".jsonl", ".txt", ".csv", ".tsv", ".xml", ".yml", ".yaml", ".md", ".log") +_IMAGE_PREVIEW_EXTS = (".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp", ".ico", ".avif") + +_MEDIA_FILE_EXTS = { + ".mp4", + ".webm", + ".mkv", + ".avi", + ".mov", + ".flv", + ".wmv", + ".m4v", + ".mpg", + ".mpeg", + ".ts", + ".m2ts", + ".mts", + ".3gp", + ".3g2", + ".ogv", + ".mp3", + ".m4a", + ".aac", + ".ogg", + ".oga", + ".opus", + ".wav", + ".flac", + ".alac", + ".aiff", + ".wma", + ".mka", + ".ac3", + ".eac3", + ".dts", +} + + +def _normalize_output_files(output_files: Any) -> dict[str, dict[str, Any]]: + if isinstance(output_files, dict): + normalized: dict[str, dict[str, Any]] = {} + for path, metadata in output_files.items(): + if not path: + continue + normalized[str(path)] = dict(metadata) if isinstance(metadata, dict) else {} + return normalized + return {} + + +def _coerce_output_file_size(value: Any) 
-> int | None: + try: + return max(int(value or 0), 0) + except (TypeError, ValueError): + return None + + +def _count_media_files(result) -> int: + try: + output_files = _normalize_output_files(getattr(result, "output_files", None) or {}) + except Exception: + output_files = {} + + if output_files: + return sum(1 for path in output_files.keys() if Path(path).suffix.lower() in _MEDIA_FILE_EXTS) + + try: + plugin_dir = Path(result.snapshot_dir) / result.plugin + except Exception: + return 0 + + if not plugin_dir.exists(): + return 0 + + count = 0 + scanned = 0 + max_scan = 500 + for file_path in plugin_dir.rglob("*"): + if scanned >= max_scan: + break + scanned += 1 + if not file_path.is_file(): + continue + if file_path.suffix.lower() in _MEDIA_FILE_EXTS: + count += 1 + return count + + +def _list_media_files(result) -> list[dict]: + media_files: list[dict] = [] + try: + plugin_dir = Path(result.snapshot_dir) / result.plugin + except Exception: + return media_files + + output_files = _normalize_output_files(getattr(result, "output_files", None) or {}) + candidates: list[tuple[Path, int | None]] = [] + if output_files: + for path, metadata in output_files.items(): + rel_path = Path(path) + if rel_path.suffix.lower() in _MEDIA_FILE_EXTS: + candidates.append((rel_path, _coerce_output_file_size(metadata.get("size")))) + + if not candidates and plugin_dir.exists(): + scanned = 0 + max_scan = 2000 + for file_path in plugin_dir.rglob("*"): + if scanned >= max_scan: + break + scanned += 1 + if not file_path.is_file(): + continue + if file_path.suffix.lower() in _MEDIA_FILE_EXTS: + try: + rel_path = file_path.relative_to(plugin_dir) + except ValueError: + continue + try: + size = file_path.stat().st_size + except OSError: + size = None + candidates.append((rel_path, size)) + + for rel_path, size in candidates: + href = str(Path(result.plugin) / rel_path) + media_files.append( + { + "name": rel_path.name, + "path": href, + "size": size, + }, + ) + + 
media_files.sort(key=lambda item: item["name"].lower()) + return media_files + + +def _resolve_snapshot_output_file(snapshot_dir: str | Path | None, raw_output_path: str | None) -> Path | None: + if not snapshot_dir or not raw_output_path or str(raw_output_path).strip() in (".", "/", "./"): + return None + + output_file = Path(raw_output_path) + if not output_file.is_absolute(): + output_file = Path(snapshot_dir) / raw_output_path + + try: + output_file = output_file.resolve() + snap_dir = Path(snapshot_dir).resolve() + if snap_dir not in output_file.parents and output_file != snap_dir: + return None + except Exception: + return None + + if output_file.exists() and output_file.is_file(): + return output_file + return None + + +def _is_text_preview_path(raw_output_path: str | None) -> bool: + return (raw_output_path or "").lower().endswith(_TEXT_PREVIEW_EXTS) + + +def _is_image_preview_path(raw_output_path: str | None) -> bool: + return (raw_output_path or "").lower().endswith(_IMAGE_PREVIEW_EXTS) + + +def _is_root_snapshot_output_path(raw_output_path: str | None) -> bool: + normalized = str(raw_output_path or "").strip().lower() + return normalized in ("", ".", "./", "/", "index.html", "index.json") + + +def _build_snapshot_files_url(snapshot_id: str, request=None) -> str: + return build_snapshot_url(str(snapshot_id), "/?files=1", request=request) + + +def _build_snapshot_preview_url(snapshot_id: str, path: str = "", request=None) -> str: + if path == "about:blank": + return path + if _is_root_snapshot_output_path(path): + return _build_snapshot_files_url(snapshot_id, request=request) + url = build_snapshot_url(str(snapshot_id), path, request=request) + if not (_is_text_preview_path(path) or _is_image_preview_path(path)): + return url + separator = "&" if "?" in url else "?" 
+ return f"{url}{separator}preview=1" + + +def _render_text_preview(plugin: str, icon_html: str, snippet: str) -> str: + plugin_attr = escape(plugin or "") + plugin_label = escape(plugin or "") + escaped = escape(snippet) + return ( + f'
' + f'
' + f'{icon_html}' + f'{plugin_label}' + f"
" + f'
{escaped}
' + f"
" + ) + + +def _render_fallback_card(plugin: str, icon_html: str, fallback_label: str) -> str: + plugin_attr = escape(plugin or "") + plugin_label = escape(plugin or "") + fallback_attr = escape(fallback_label) + return ( + f'
' + f'{icon_html}' + f'{plugin_label}' + f'{fallback_attr}' + f"
" + ) + + +def _render_text_file_preview(snapshot_dir: str | Path | None, raw_output_path: str | None, plugin: str, icon_html: str) -> str | None: + output_file = _resolve_snapshot_output_file(snapshot_dir, raw_output_path) + if not output_file: + return None + + try: + with output_file.open("rb") as f: + raw = f.read(4096) + text = raw.decode("utf-8", errors="replace").strip() + if not text: + return None + lines = text.splitlines()[:6] + snippet = "\n".join(lines) + return _render_text_preview(plugin, icon_html, snippet) + except Exception: + return None + + +@register.filter(name="split") +def split(value, separator: str = ","): + return (value or "").split(separator) + + +@register.filter(name="index") +def index(value, position): + try: + return value[int(position)] + except Exception: + return None + @register.filter -def file_size(num_bytes: Union[int, float]) -> str: - for count in ['Bytes','KB','MB','GB']: +def file_size(num_bytes: int | float) -> str: + for count in ["Bytes", "KB", "MB", "GB"]: if num_bytes > -1024.0 and num_bytes < 1024.0: - return '%3.1f %s' % (num_bytes, count) + return f"{num_bytes:3.1f} {count}" num_bytes /= 1024.0 - return '%3.1f %s' % (num_bytes, 'TB') + return "{:3.1f} {}".format(num_bytes, "TB") + def result_list(cl): """ @@ -25,16 +283,236 @@ def result_list(cl): """ num_sorted_fields = 0 return { - 'cl': cl, - 'num_sorted_fields': num_sorted_fields, - 'results': cl.result_list, + "cl": cl, + "num_sorted_fields": num_sorted_fields, + "results": cl.result_list, } -@register.tag(name='snapshots_grid') + +@register.tag(name="snapshots_grid") def result_list_tag(parser, token): return InclusionAdminNode( - parser, token, + parser, + token, func=result_list, - template_name='snapshots_grid.html', + template_name="snapshots_grid.html", takes_context=False, ) + + +@register.simple_tag(takes_context=True) +def url_replace(context, **kwargs): + dict_ = context["request"].GET.copy() + dict_.update(**kwargs) + return dict_.urlencode() + + 
+@register.simple_tag(takes_context=True) +def admin_base_url(context) -> str: + return get_admin_base_url(request=context.get("request")) + + +@register.simple_tag(takes_context=True) +def web_base_url(context) -> str: + return get_web_base_url(request=context.get("request")) + + +@register.simple_tag(takes_context=True) +def public_base_url(context) -> str: + return get_public_base_url(request=context.get("request")) + + +@register.simple_tag(takes_context=True) +def snapshot_base_url(context, snapshot) -> str: + snapshot_id = getattr(snapshot, "id", snapshot) + return get_snapshot_base_url(str(snapshot_id), request=context.get("request")) + + +@register.simple_tag(takes_context=True) +def snapshot_url(context, snapshot, path: str = "") -> str: + snapshot_id = getattr(snapshot, "id", snapshot) + return build_snapshot_url(str(snapshot_id), path, request=context.get("request")) + + +@register.simple_tag(takes_context=True) +def snapshot_preview_url(context, snapshot, path: str = "") -> str: + snapshot_id = getattr(snapshot, "id", snapshot) + return _build_snapshot_preview_url(str(snapshot_id), path, request=context.get("request")) + + +@register.simple_tag +def plugin_icon(plugin: str) -> str: + """ + Render the icon for a plugin. + + Usage: {% plugin_icon "screenshot" %} + """ + icon_html = get_plugin_icon(plugin) + return mark_safe( + f'{icon_html}', + ) + + +@register.simple_tag(takes_context=True) +def plugin_card(context, result) -> str: + """ + Render the card template for an archive result. 
+ + Usage: {% plugin_card result %} + + Context variables passed to template: + - result: ArchiveResult object + - snapshot: Parent Snapshot object + - output_path: Path to output relative to snapshot dir (from embed_path()) + - plugin: Plugin base name + """ + if result is None or not hasattr(result, "plugin"): + return "" + + plugin = get_plugin_name(result.plugin) + template_str = get_plugin_template(plugin, "card") + + # Use embed_path() for the display path + raw_output_path = result.embed_path() if hasattr(result, "embed_path") else "" + output_url = build_snapshot_url( + str(getattr(result, "snapshot_id", "")), + raw_output_path or "", + request=context.get("request"), + ) + + icon_html = get_plugin_icon(plugin) + plugin_lower = (plugin or "").lower() + media_file_count = _count_media_files(result) if plugin_lower in ("ytdlp", "yt-dlp", "youtube-dl") else 0 + media_files = _list_media_files(result) if plugin_lower in ("ytdlp", "yt-dlp", "youtube-dl") else [] + if media_files: + snapshot_id = str(getattr(result, "snapshot_id", "")) + request = context.get("request") + for item in media_files: + path = item.get("path") or "" + item["url"] = build_snapshot_url(snapshot_id, path, request=request) if path else "" + + output_lower = (raw_output_path or "").lower() + force_text_preview = output_lower.endswith(_TEXT_PREVIEW_EXTS) + + # Create a mini template and render it with context + try: + if template_str and raw_output_path and str(raw_output_path).strip() not in (".", "/", "./") and not force_text_preview: + tpl = template.Template(template_str) + ctx = template.Context( + { + "result": result, + "snapshot": result.snapshot, + "output_path": output_url, + "output_path_raw": raw_output_path, + "plugin": plugin, + "plugin_icon": icon_html, + "media_file_count": media_file_count, + "media_files": media_files, + }, + ) + rendered = tpl.render(ctx) + # Only return non-empty content (strip whitespace to check) + if rendered.strip(): + return mark_safe(rendered) + 
except Exception: + pass + + if force_text_preview: + preview = _render_text_file_preview(getattr(result, "snapshot_dir", None), raw_output_path, plugin, icon_html) + if preview: + return mark_safe(preview) + + if output_lower.endswith(_TEXT_PREVIEW_EXTS): + fallback_label = "text" + else: + fallback_label = "output" + + return mark_safe(_render_fallback_card(plugin, icon_html, fallback_label)) + + +@register.simple_tag +def output_card(snapshot, output_path: str, plugin: str) -> str: + plugin_name = get_plugin_name(plugin) + icon_html = get_plugin_icon(plugin_name) + preview = _render_text_file_preview(getattr(snapshot, "output_dir", None), output_path, plugin_name, icon_html) + if preview: + return mark_safe(preview) + + output_lower = (output_path or "").lower() + fallback_label = "text" if output_lower.endswith(_TEXT_PREVIEW_EXTS) else "output" + return mark_safe(_render_fallback_card(plugin_name, icon_html, fallback_label)) + + +@register.simple_tag(takes_context=True) +def plugin_full(context, result) -> str: + """ + Render the full template for an archive result. 
+ + Usage: {% plugin_full result %} + """ + if result is None or not hasattr(result, "plugin"): + return "" + + plugin = get_plugin_name(result.plugin) + template_str = get_plugin_template(plugin, "full") + + if not template_str: + return "" + + raw_output_path = "" + if hasattr(result, "embed_path_db"): + raw_output_path = result.embed_path_db() or "" + if not raw_output_path and hasattr(result, "embed_path"): + raw_output_path = result.embed_path() or "" + if _is_root_snapshot_output_path(raw_output_path): + return "" + output_url = build_snapshot_url( + str(getattr(result, "snapshot_id", "")), + raw_output_path, + request=context.get("request"), + ) + + try: + tpl = template.Template(template_str) + ctx = template.Context( + { + "result": result, + "snapshot": result.snapshot, + "output_path": output_url, + "output_path_raw": raw_output_path, + "plugin": plugin, + }, + ) + rendered = tpl.render(ctx) + # Only return non-empty content (strip whitespace to check) + if rendered.strip(): + return mark_safe(rendered) + return "" + except Exception: + return "" + + +@register.filter +def plugin_name(value: str) -> str: + """ + Get the base name of a plugin (strips numeric prefix). + + Usage: {{ result.plugin|plugin_name }} + """ + return get_plugin_name(value) + + +@register.simple_tag(takes_context=True) +def api_token(context) -> str: + """ + Return an API token string for the logged-in user, creating one if needed. + """ + from archivebox.api.auth import get_or_create_api_token + + request = context.get("request") + user = getattr(request, "user", None) + if not user or not user.is_authenticated: + return "" + + token = get_or_create_api_token(user) + return token.token if token else "" diff --git a/archivebox/core/tests.py b/archivebox/core/tests.py deleted file mode 100644 index 4d66077c6d..0000000000 --- a/archivebox/core/tests.py +++ /dev/null @@ -1,3 +0,0 @@ -#from django.test import TestCase - -# Create your tests here. 
diff --git a/archivebox/core/urls.py b/archivebox/core/urls.py index 87a302b817..85a5bb8544 100644 --- a/archivebox/core/urls.py +++ b/archivebox/core/urls.py @@ -1,48 +1,89 @@ -from django.contrib import admin +__package__ = "archivebox.core" -from django.urls import path, include +from django.urls import path, re_path, include from django.views import static -from django.contrib.staticfiles.urls import staticfiles_urlpatterns from django.conf import settings from django.views.generic.base import RedirectView +from django.http import HttpRequest -from core.views import HomepageView, SnapshotView, PublicIndexView, AddView +from archivebox.misc.serve_static import serve_static +from archivebox.core.admin_site import archivebox_admin +from archivebox.core.views import ( + HomepageView, + SnapshotView, + SnapshotPathView, + SnapshotReplayView, + OriginalDomainReplayView, + PublicIndexView, + AddView, + WebAddView, + HealthCheckView, + live_progress_view, +) -# print('DEBUG', settings.DEBUG) -urlpatterns = [ - path('public/', PublicIndexView.as_view(), name='public-index'), +# GLOBAL_CONTEXT doesn't work as-is, disabled for now: https://github.com/ArchiveBox/ArchiveBox/discussions/1306 +# from archivebox.config import VERSION, VERSIONS_AVAILABLE, CAN_UPGRADE +# GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': VERSIONS_AVAILABLE, 'CAN_UPGRADE': CAN_UPGRADE} - path('robots.txt', static.serve, {'document_root': settings.STATICFILES_DIRS[0], 'path': 'robots.txt'}), - path('favicon.ico', static.serve, {'document_root': settings.STATICFILES_DIRS[0], 'path': 'favicon.ico'}), - path('docs/', RedirectView.as_view(url='https://github.com/ArchiveBox/ArchiveBox/wiki'), name='Docs'), +# print('DEBUG', settings.DEBUG) - path('archive/', RedirectView.as_view(url='/')), - path('archive/', SnapshotView.as_view(), name='Snapshot'), +urlpatterns = [ + re_path(r"^static/(?P.*)$", serve_static), + # re_path(r"^media/(?P.*)$", static.serve, {"document_root": 
settings.MEDIA_ROOT}), + path("robots.txt", static.serve, {"document_root": settings.STATICFILES_DIRS[0], "path": "robots.txt"}), + path("favicon.ico", static.serve, {"document_root": settings.STATICFILES_DIRS[0], "path": "favicon.ico"}), + path("docs/", RedirectView.as_view(url="https://github.com/ArchiveBox/ArchiveBox/wiki"), name="Docs"), + path("public/", PublicIndexView.as_view(), name="public-index"), + path("public.html", RedirectView.as_view(url="/public/"), name="public-index-html"), + path("archive/", RedirectView.as_view(url="/")), + path("archive/", SnapshotView.as_view(), name="Snapshot"), + re_path(r"^snapshot\/(?P[0-9a-fA-F-]{8,36})(?:\/(?P.*))?$", SnapshotReplayView.as_view(), name="snapshot-replay"), + re_path(r"^original\/(?P[^/]+)(?:\/(?P.*))?$", OriginalDomainReplayView.as_view(), name="original-replay"), + re_path(r"^web/(?P(?!\d{4}(?:\d{2})?(?:\d{2})?(?:/|$)).+)$", WebAddView.as_view(), name="web-add"), + re_path( + r"^(?P[^/]+)/(?P\d{4}(?:\d{2})?(?:\d{2})?)/(?Phttps?://.*)$", + SnapshotPathView.as_view(), + name="snapshot-path-url", + ), + re_path( + r"^(?P[^/]+)/(?P\d{4}(?:\d{2})?(?:\d{2})?)/(?P[^/]+)(?:/(?P[0-9a-fA-F-]{8,36})(?:/(?P.*))?)?$", + SnapshotPathView.as_view(), + name="snapshot-path", + ), + re_path(r"^(?P[^/]+)/(?Phttps?://.*)$", SnapshotPathView.as_view(), name="snapshot-path-url-nodate"), + re_path( + r"^(?P[^/]+)/(?P[^/]+)(?:/(?P[0-9a-fA-F-]{8,36})(?:/(?P.*))?)?$", + SnapshotPathView.as_view(), + name="snapshot-path-nodate", + ), + path("admin/core/snapshot/add/", RedirectView.as_view(url="/add/")), + path("add/", AddView.as_view(), name="add"), + path("accounts/login/", RedirectView.as_view(url="/admin/login/")), + path("accounts/logout/", RedirectView.as_view(url="/admin/logout/")), + path("accounts/", include("django.contrib.auth.urls")), + path("admin/live-progress/", live_progress_view, name="live_progress"), + path("admin/", archivebox_admin.urls), + path("api/", include("archivebox.api.urls"), name="api"), + 
path("health/", HealthCheckView.as_view(), name="healthcheck"), + path("error/", lambda request: _raise_test_error(request)), + # path('jet_api/', include('jet_django.urls')), Enable to use https://www.jetadmin.io/integrations/django + path("index.html", RedirectView.as_view(url="/")), + path("", HomepageView.as_view(), name="Home"), +] - path('admin/core/snapshot/add/', RedirectView.as_view(url='/add/')), - path('add/', AddView.as_view(), name='add'), - - path('accounts/login/', RedirectView.as_view(url='/admin/login/')), - path('accounts/logout/', RedirectView.as_view(url='/admin/logout/')), +def _raise_test_error(_request: HttpRequest): + raise ZeroDivisionError("Intentional test error route") - path('accounts/', include('django.contrib.auth.urls')), - path('admin/', admin.site.urls), - - path('index.html', RedirectView.as_view(url='/')), - path('index.json', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'index.json'}), - path('', HomepageView.as_view(), name='Home'), -] -urlpatterns += staticfiles_urlpatterns() if settings.DEBUG_TOOLBAR: - import debug_toolbar - urlpatterns += [ - path('__debug__/', include(debug_toolbar.urls)), - ] + urlpatterns += [path("__debug__/", include("debug_toolbar.urls"))] + +if settings.DEBUG_REQUESTS_TRACKER: + urlpatterns += [path("__requests_tracker__/", include("requests_tracker.urls"))] # # Proposed FUTURE URLs spec @@ -54,7 +95,7 @@ # path('/admin', admin.site.urls) # path('/accounts', django.contrib.auth.urls) -# # Prposed REST API spec +# # Proposed REST API spec # # :slugs can be uuid, short_uuid, or any of the unique index_fields # path('api/v1/'), # path('api/v1/core/' [GET]) diff --git a/archivebox/core/views.py b/archivebox/core/views.py index c056cd65e3..999ea5d662 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -1,68 +1,314 @@ -__package__ = 'archivebox.core' +__package__ = "archivebox.core" -from io import StringIO -from contextlib import redirect_stdout +import json +import os 
+import posixpath +from glob import glob, escape +from django.utils import timezone +import inspect +from typing import cast, get_type_hints +from collections.abc import Callable +from pathlib import Path +from urllib.parse import quote, urlparse from django.shortcuts import render, redirect -from django.http import HttpResponse, Http404 -from django.utils.html import format_html, mark_safe -from django.views import View, static +from django.http import JsonResponse, HttpRequest, HttpResponse, Http404, HttpResponseForbidden +from django.utils.html import format_html +from django.utils.safestring import mark_safe +from django.views import View from django.views.generic.list import ListView from django.views.generic import FormView from django.db.models import Q +from django.contrib import messages from django.contrib.auth.mixins import UserPassesTestMixin +from django.views.decorators.csrf import csrf_exempt +from django.utils.decorators import method_decorator -from core.models import Snapshot -from core.forms import AddLinkForm - -from ..config import ( - OUTPUT_DIR, - PUBLIC_INDEX, - PUBLIC_SNAPSHOTS, - PUBLIC_ADD_VIEW, - VERSION, - FOOTER_INFO, - SNAPSHOTS_PER_PAGE, +from admin_data_views.typing import TableContext, ItemContext, SectionData +from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink + +from archivebox.config import CONSTANTS, CONSTANTS_CONFIG, DATA_DIR, VERSION +from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG, SEARCH_BACKEND_CONFIG +from archivebox.config.configset import get_flat_config, get_config, get_all_configs +from archivebox.misc.util import base_url, htmlencode, ts_to_date_str, urldecode, without_fragment +from archivebox.misc.serve_static import serve_static_with_byterange_support +from archivebox.misc.logging_util import printable_filesize +from archivebox.search import get_search_mode, prioritize_metadata_matches, query_search_index + +from archivebox.core.models import Snapshot +from 
archivebox.core.host_utils import ( + build_admin_url, + build_snapshot_url, + build_web_url, + get_admin_host, + get_snapshot_host, + get_snapshot_lookup_key, + get_web_host, + host_matches, +) +from archivebox.core.forms import AddLinkForm +from archivebox.crawls.models import Crawl +from archivebox.hooks import ( + BUILTIN_PLUGINS_DIR, + USER_PLUGINS_DIR, + discover_plugin_configs, + get_enabled_plugins, + get_plugin_name, + iter_plugin_dirs, ) -from ..main import add -from ..util import base_url, ansi_to_html -from ..search import query_search_index + + +ABX_PLUGINS_GITHUB_BASE_URL = "https://github.com/ArchiveBox/abx-plugins/tree/main/abx_plugins/plugins/" +LIVE_PLUGIN_BASE_URL = "/admin/environment/plugins/" + + +def _files_index_target(snapshot: Snapshot, archivefile: str | None) -> str: + target = archivefile or "" + if target == "index.html": + target = "" + fullpath = Path(snapshot.output_dir) / target + if fullpath.is_file(): + target = str(Path(target).parent) + if target == ".": + target = "" + return target + + +def _find_snapshot_by_ref(snapshot_ref: str) -> Snapshot | None: + lookup = get_snapshot_lookup_key(snapshot_ref) + if not lookup: + return None + + if len(lookup) == 12 and "-" not in lookup: + return Snapshot.objects.filter(id__endswith=lookup).order_by("-created_at", "-downloaded_at").first() + + try: + return Snapshot.objects.get(pk=lookup) + except Snapshot.DoesNotExist: + try: + return Snapshot.objects.get(id__startswith=lookup) + except Snapshot.DoesNotExist: + return None + except Snapshot.MultipleObjectsReturned: + return Snapshot.objects.filter(id__startswith=lookup).first() + + +def _admin_login_redirect_or_forbidden(request: HttpRequest): + if SERVER_CONFIG.CONTROL_PLANE_ENABLED: + return redirect(f"/admin/login/?next={request.path}") + return HttpResponseForbidden("ArchiveBox is running with the control plane disabled in this security mode.") class HomepageView(View): def get(self, request): - if request.user.is_authenticated: - 
return redirect('/admin/core/snapshot/') + if request.user.is_authenticated and SERVER_CONFIG.CONTROL_PLANE_ENABLED: + return redirect("/admin/core/snapshot/") + + if SERVER_CONFIG.PUBLIC_INDEX: + return redirect("/public") - if PUBLIC_INDEX: - return redirect('/public') - - return redirect(f'/admin/login/?next={request.path}') + return _admin_login_redirect_or_forbidden(request) class SnapshotView(View): # render static html index from filesystem archive//index.html + @staticmethod + def find_snapshots_for_url(path: str): + """Return a queryset of snapshots matching a URL-ish path.""" + + def _fragmentless_url_query(url: str) -> Q: + canonical = without_fragment(url) + return Q(url=canonical) | Q(url__startswith=f"{canonical}#") + + normalized = without_fragment(path) + if path.startswith(("http://", "https://")): + # try exact match on full url / ID first + qs = Snapshot.objects.filter(_fragmentless_url_query(path) | Q(id__icontains=path) | Q(id__icontains=normalized)) + if qs.exists(): + return qs + normalized = normalized.split("://", 1)[1] + + # try exact match on full url / ID (without scheme) + qs = Snapshot.objects.filter( + _fragmentless_url_query("http://" + normalized) + | _fragmentless_url_query("https://" + normalized) + | Q(id__icontains=normalized), + ) + if qs.exists(): + return qs + + # fall back to match on exact base_url + base = base_url(normalized) + qs = Snapshot.objects.filter( + _fragmentless_url_query("http://" + base) | _fragmentless_url_query("https://" + base), + ) + if qs.exists(): + return qs + + # fall back to matching base_url as prefix + return Snapshot.objects.filter(Q(url__startswith="http://" + base) | Q(url__startswith="https://" + base)) + + @staticmethod + def render_live_index(request, snapshot): + TITLE_LOADING_MSG = "Not yet archived..." 
+ from archivebox.core.widgets import TagEditorWidget + + hidden_card_plugins = {"archivedotorg", "favicon", "title"} + outputs = [ + out + for out in snapshot.discover_outputs(include_filesystem_fallback=True) + if (out.get("size") or 0) > 0 and out.get("name") not in hidden_card_plugins + ] + archiveresults = {out["name"]: out for out in outputs} + hash_index = snapshot.hashes_index + # Get available extractor plugins from hooks (sorted by numeric prefix for ordering) + # Convert to base names for display ordering + all_plugins = [get_plugin_name(e) for e in get_enabled_plugins()] + accounted_entries: set[str] = set() + for output in outputs: + output_name = output.get("name") or "" + if output_name: + accounted_entries.add(output_name) + output_path = output.get("path") or "" + if not output_path: + continue + parts = Path(output_path).parts + if parts: + accounted_entries.add(parts[0]) + + loose_items, failed_items = snapshot.get_detail_page_auxiliary_items(outputs, hidden_card_plugins=hidden_card_plugins) + preview_priority = [ + "singlefile", + "screenshot", + "wget", + "dom", + "pdf", + "readability", + ] + preferred_types = tuple(preview_priority + [p for p in all_plugins if p not in preview_priority]) + all_types = preferred_types + tuple(result_type for result_type in archiveresults.keys() if result_type not in preferred_types) + + best_result = {"path": "about:blank", "result": None} + for result_type in preferred_types: + if result_type in archiveresults: + best_result = archiveresults[result_type] + break + + related_snapshots_qs = SnapshotView.find_snapshots_for_url(snapshot.url) + related_snapshots = list(related_snapshots_qs.exclude(id=snapshot.id).order_by("-bookmarked_at", "-created_at", "-timestamp")[:25]) + related_years_map: dict[int, list[Snapshot]] = {} + for snap in [snapshot, *related_snapshots]: + snap_dt = snap.bookmarked_at or snap.created_at or snap.downloaded_at + if not snap_dt: + continue + related_years_map.setdefault(snap_dt.year, 
[]).append(snap) + related_years = [] + for year, snaps in related_years_map.items(): + snaps_sorted = sorted( + snaps, + key=lambda s: s.bookmarked_at or s.created_at or s.downloaded_at or timezone.now(), + reverse=True, + ) + related_years.append( + { + "year": year, + "latest": snaps_sorted[0], + "snapshots": snaps_sorted, + }, + ) + related_years.sort(key=lambda item: item["year"], reverse=True) + + warc_path = next( + (rel_path for rel_path in hash_index if rel_path.startswith("warc/") and ".warc" in Path(rel_path).name), + "warc/", + ) + + ordered_outputs = sorted( + archiveresults.values(), + key=lambda r: all_types.index(r["name"]) if r["name"] in all_types else -r["size"], + ) + non_compact_outputs = [out for out in ordered_outputs if not out.get("is_compact") and not out.get("is_metadata")] + compact_outputs = [out for out in ordered_outputs if out.get("is_compact") or out.get("is_metadata")] + tag_widget = TagEditorWidget() + output_size = sum(int(out.get("size") or 0) for out in ordered_outputs) + is_archived = bool(ordered_outputs or snapshot.downloaded_at or snapshot.status == Snapshot.StatusChoices.SEALED) + + context = { + "id": str(snapshot.id), + "snapshot_id": str(snapshot.id), + "url": snapshot.url, + "archive_path": snapshot.archive_path_from_db, + "title": htmlencode(snapshot.resolved_title or (snapshot.base_url if is_archived else TITLE_LOADING_MSG)), + "extension": snapshot.extension or "html", + "tags": snapshot.tags_str() or "untagged", + "size": printable_filesize(output_size) if output_size else "pending", + "status": "archived" if is_archived else "not yet archived", + "status_color": "success" if is_archived else "danger", + "bookmarked_date": snapshot.bookmarked_date, + "downloaded_datestr": snapshot.downloaded_datestr, + "num_outputs": snapshot.num_outputs, + "num_failures": snapshot.num_failures, + "oldest_archive_date": ts_to_date_str(snapshot.oldest_archive_date), + "warc_path": warc_path, + "PREVIEW_ORIGINALS": 
SERVER_CONFIG.PREVIEW_ORIGINALS, + "archiveresults": [*non_compact_outputs, *compact_outputs], + "best_result": best_result, + "snapshot": snapshot, # Pass the snapshot object for template tags + "related_snapshots": related_snapshots, + "related_years": related_years, + "loose_items": loose_items, + "failed_items": failed_items, + "title_tags": [{"name": tag.name, "style": tag_widget._tag_style(tag.name)} for tag in snapshot.tags.all().order_by("name")], + } + return render(template_name="core/snapshot.html", request=request, context=context) + def get(self, request, path): - if not request.user.is_authenticated and not PUBLIC_SNAPSHOTS: - return redirect(f'/admin/login/?next={request.path}') + if not request.user.is_authenticated and not SERVER_CONFIG.PUBLIC_SNAPSHOTS: + return _admin_login_redirect_or_forbidden(request) + + snapshot = None try: - slug, archivefile = path.split('/', 1) + slug, archivefile = path.split("/", 1) except (IndexError, ValueError): - slug, archivefile = path.split('/', 1)[0], 'index.html' + slug, archivefile = path.split("/", 1)[0], "index.html" # slug is a timestamp - if slug.replace('.','').isdigit(): - + if slug.replace(".", "").isdigit(): # missing trailing slash -> redirect to index - if '/' not in path: - return redirect(f'{path}/index.html') + if "/" not in path: + return redirect(f"{path}/index.html") try: try: snapshot = Snapshot.objects.get(Q(timestamp=slug) | Q(id__startswith=slug)) - response = static.serve(request, archivefile, document_root=snapshot.link_dir, show_indexes=True) + canonical_base = snapshot.url_path + if canonical_base != snapshot.legacy_archive_path: + target_path = f"/{canonical_base}/{archivefile or 'index.html'}" + query = request.META.get("QUERY_STRING") + if query: + target_path = f"{target_path}?{query}" + return redirect(target_path) + + if request.GET.get("files"): + target_path = _files_index_target(snapshot, archivefile) + response = serve_static_with_byterange_support( + request, + target_path, + 
document_root=snapshot.output_dir, + show_indexes=True, + is_archive_replay=True, + ) + elif archivefile == "index.html": + # if they requested snapshot index, serve live rendered template instead of static html + response = self.render_live_index(request, snapshot) + else: + target = build_snapshot_url(str(snapshot.id), archivefile, request=request) + query = request.META.get("QUERY_STRING") + if query: + target = f"{target}?{query}" + return redirect(target) response["Link"] = f'<{snapshot.url}>; rel="canonical"' return response except Snapshot.DoesNotExist: @@ -75,10 +321,10 @@ def get(self, request, path): return HttpResponse( format_html( ( - '



' - 'No Snapshot directories match the given timestamp or UUID: {}

' + "



" + "No Snapshot directories match the given timestamp/ID: {}

" 'You can add a new Snapshot, or return to the Main Index' - '
' + "
" ), slug, path, @@ -87,209 +333,1670 @@ def get(self, request, path): status=404, ) except Snapshot.MultipleObjectsReturned: - snapshot_hrefs = mark_safe('
').join( + snapshot_hrefs = mark_safe("
").join( format_html( - '{} {} {} {}', - snap.added.strftime('%Y-%m-%d %H:%M:%S'), - snap.timestamp, + '{} {} {} {}', + snap.bookmarked_at.strftime("%Y-%m-%d %H:%M:%S"), + snap.archive_path, snap.timestamp, snap.url, - snap.title or '', + snap.title_stripped[:64] or "", ) - for snap in Snapshot.objects.filter(timestamp__startswith=slug).only('url', 'timestamp', 'title', 'added').order_by('-added') + for snap in Snapshot.objects.filter(timestamp__startswith=slug) + .only("url", "timestamp", "title", "bookmarked_at") + .order_by("-bookmarked_at") ) return HttpResponse( format_html( - ( - 'Multiple Snapshots match the given timestamp/UUID {}
'
-                        ),
+                        ("Multiple Snapshots match the given timestamp/ID {}
"),
                         slug,
-                    ) + snapshot_hrefs + format_html(
-                        (
-                            '

' - 'Choose a Snapshot to proceed or go back to the Main Index' - ) - ), + ) + + snapshot_hrefs + + format_html('

Choose a Snapshot to proceed or go back to the Main Index'), content_type="text/html", status=404, ) except Http404: + assert snapshot # (Snapshot.DoesNotExist is already handled above) + # Snapshot dir exists but file within does not e.g. 124235.324234/screenshot.png return HttpResponse( format_html( ( - '



' - f'Snapshot [{snapshot.timestamp}] exists in DB, but resource {snapshot.timestamp}/' - '{}' - f' does not exist in snapshot dir yet.

' - 'Maybe this resource type is not availabe for this Snapshot,
or the archiving process has not completed yet?
' - f'
# run this cmd to finish archiving this Snapshot
archivebox update -t timestamp {snapshot.timestamp}


' + "" + "Snapshot Not Found" + #'' + "" + "



" + f'Snapshot [{snapshot.timestamp}]: {snapshot.url}
' + f"was queued on {str(snapshot.bookmarked_at).split('.')[0]}, " + f'but no files have been saved yet in:
{snapshot.timestamp}/' + "{}" + f"

" + "It's possible {} " + f"during the last capture on {str(snapshot.bookmarked_at).split('.')[0]},
or that the archiving process has not completed yet.
" + f"
# run this cmd to finish/retry archiving this Snapshot
" + f'archivebox update -t timestamp {snapshot.timestamp}


' '
' - 'Next steps:
' - f'- list all the Snapshot files .*
' - f'- view the Snapshot ./index.html
' - f'- go to the Snapshot admin to edit
' - f'- go to the Snapshot actions to re-archive
' + "Next steps:
" + f'- list all the Snapshot files .*
' + f'- view the Snapshot ./index.html
' + f'- go to the Snapshot admin to edit
' + f'- go to the Snapshot actions to re-archive
' '- or return to the main index...
' - '
' + "
" + "" ), - archivefile, + archivefile if str(archivefile) != "None" else "", + f"the {archivefile} resource could not be fetched" + if str(archivefile) != "None" + else "the original site was not available", ), content_type="text/html", status=404, ) + # slug is a URL try: try: - # try exact match on full url first - snapshot = Snapshot.objects.get( - Q(url='http://' + path) | Q(url='https://' + path) | Q(id__startswith=path) - ) + snapshot = SnapshotView.find_snapshots_for_url(path).get() except Snapshot.DoesNotExist: - # fall back to match on exact base_url - try: - snapshot = Snapshot.objects.get( - Q(url='http://' + base_url(path)) | Q(url='https://' + base_url(path)) - ) - except Snapshot.DoesNotExist: - # fall back to matching base_url as prefix - snapshot = Snapshot.objects.get( - Q(url__startswith='http://' + base_url(path)) | Q(url__startswith='https://' + base_url(path)) - ) - return redirect(f'/archive/{snapshot.timestamp}/index.html') + raise except Snapshot.DoesNotExist: return HttpResponse( format_html( ( - '



' - 'No Snapshots match the given url: {}


' + "



" + "No Snapshots match the given url: {}


" 'Return to the Main Index, or:

' '+ Add a new Snapshot for {}

' - '
' + "
" ), base_url(path), - path if '://' in path else f'https://{path}', + path if "://" in path else f"https://{path}", path, ), content_type="text/html", status=404, ) except Snapshot.MultipleObjectsReturned: - snapshot_hrefs = mark_safe('
').join( + snapshots = SnapshotView.find_snapshots_for_url(path) + snapshot_hrefs = mark_safe("
").join( format_html( - '{} {} {} {}', - snap.added.strftime('%Y-%m-%d %H:%M:%S'), - snap.timestamp, + '{} {} {} {} {}', + snap.bookmarked_at.strftime("%Y-%m-%d %H:%M:%S"), + str(snap.id)[:8], + snap.archive_path, snap.timestamp, snap.url, - snap.title or '', + snap.title_stripped[:64] or "", ) - for snap in Snapshot.objects.filter( - Q(url__startswith='http://' + base_url(path)) | Q(url__startswith='https://' + base_url(path)) - ).only('url', 'timestamp', 'title', 'added').order_by('-added') + for snap in snapshots.only("url", "timestamp", "title", "bookmarked_at").order_by("-bookmarked_at") ) return HttpResponse( format_html( - ( - 'Multiple Snapshots match the given URL {}
'
-                    ),
+                    ("Multiple Snapshots match the given URL {}
"),
                     base_url(path),
-                ) + snapshot_hrefs + format_html(
+                )
+                + snapshot_hrefs
+                + format_html('

Choose a Snapshot to proceed or go back to the Main Index'), + content_type="text/html", + status=404, + ) + + target_path = f"/{snapshot.archive_path}/index.html" + query = request.META.get("QUERY_STRING") + if query: + target_path = f"{target_path}?{query}" + return redirect(target_path) + + +class SnapshotPathView(View): + """Serve snapshots by the new URL scheme: /////...""" + + def get( + self, + request, + username: str, + date: str | None = None, + domain: str | None = None, + snapshot_id: str | None = None, + path: str = "", + url: str | None = None, + ): + if not request.user.is_authenticated and not SERVER_CONFIG.PUBLIC_SNAPSHOTS: + return _admin_login_redirect_or_forbidden(request) + + if username == "system": + return redirect(request.path.replace("/system/", "/web/", 1)) + + if date and domain and domain == date: + raise Http404 + + requested_url = url + if not requested_url and domain and domain.startswith(("http://", "https://")): + requested_url = domain + + snapshot = None + if snapshot_id: + try: + snapshot = Snapshot.objects.get(pk=snapshot_id) + except Snapshot.DoesNotExist: + try: + snapshot = Snapshot.objects.get(id__startswith=snapshot_id) + except Snapshot.DoesNotExist: + snapshot = None + except Snapshot.MultipleObjectsReturned: + snapshot = Snapshot.objects.filter(id__startswith=snapshot_id).first() + else: + # fuzzy lookup by date + domain/url (most recent) + username_lookup = "system" if username == "web" else username + if requested_url: + qs = SnapshotView.find_snapshots_for_url(requested_url).filter(crawl__created_by__username=username_lookup) + else: + qs = Snapshot.objects.filter(crawl__created_by__username=username_lookup) + + if date: + try: + if len(date) == 4: + qs = qs.filter(created_at__year=int(date)) + elif len(date) == 6: + qs = qs.filter(created_at__year=int(date[:4]), created_at__month=int(date[4:6])) + elif len(date) == 8: + qs = qs.filter( + created_at__year=int(date[:4]), + created_at__month=int(date[4:6]), + 
created_at__day=int(date[6:8]), + ) + except ValueError: + pass + + if requested_url: + snapshot = qs.order_by("-created_at", "-bookmarked_at", "-timestamp").first() + else: + requested_domain = domain or "" + if requested_domain.startswith(("http://", "https://")): + requested_domain = Snapshot.extract_domain_from_url(requested_domain) + else: + requested_domain = Snapshot.extract_domain_from_url(f"https://{requested_domain}") + + # Prefer exact domain matches + matches = [ + s for s in qs.order_by("-created_at", "-bookmarked_at") if Snapshot.extract_domain_from_url(s.url) == requested_domain + ] + snapshot = matches[0] if matches else qs.order_by("-created_at", "-bookmarked_at", "-timestamp").first() + + if not snapshot: + return HttpResponse( + format_html( ( - '

' - 'Choose a Snapshot to proceed or go back to the Main Index' - ) + "



" + "No Snapshots match the given id or url: {}


" + 'Return to the Main Index' + "
" + ), + snapshot_id or requested_url or domain, ), content_type="text/html", status=404, ) - + + canonical_base = snapshot.url_path + if date: + requested_base = f"{username}/{date}/{domain or url or ''}" + else: + requested_base = f"{username}/{domain or url or ''}" + if snapshot_id: + requested_base = f"{requested_base}/{snapshot_id}" + if canonical_base != requested_base: + target = f"/{canonical_base}/{path or 'index.html'}" + query = request.META.get("QUERY_STRING") + if query: + target = f"{target}?{query}" + return redirect(target) + + archivefile = path or "index.html" + if archivefile != "index.html" and not request.GET.get("files"): + target = build_snapshot_url(str(snapshot.id), archivefile, request=request) + query = request.META.get("QUERY_STRING") + if query: + target = f"{target}?{query}" + return redirect(target) + + if request.GET.get("files"): + target_path = _files_index_target(snapshot, archivefile) + return serve_static_with_byterange_support( + request, + target_path, + document_root=snapshot.output_dir, + show_indexes=True, + is_archive_replay=True, + ) + + if archivefile == "index.html": + return SnapshotView.render_live_index(request, snapshot) + + return serve_static_with_byterange_support( + request, + archivefile, + document_root=snapshot.output_dir, + show_indexes=True, + is_archive_replay=True, + ) + + +def _safe_archive_relpath(path: str) -> str | None: + if not path: + return "" + cleaned = posixpath.normpath(path) + cleaned = cleaned.lstrip("/") + if cleaned.startswith("..") or "/../" in f"/{cleaned}/": + return None + return cleaned + + +def _coerce_sort_timestamp(value: str | float | None) -> float: + if value is None: + return 0.0 + try: + return float(value) + except (TypeError, ValueError): + return 0.0 + + +def _snapshot_sort_key(match_path: str, cache: dict[str, float]) -> tuple[float, str]: + parts = Path(match_path).parts + date_str = "" + snapshot_id = "" + try: + idx = parts.index("snapshots") + date_str = parts[idx + 1] 
+ snapshot_id = parts[idx + 3] + except Exception: + return (_coerce_sort_timestamp(date_str), match_path) + + if snapshot_id not in cache: + snapshot = Snapshot.objects.filter(id=snapshot_id).only("bookmarked_at", "created_at", "downloaded_at", "timestamp").first() + if snapshot: + snap_dt = snapshot.bookmarked_at or snapshot.created_at or snapshot.downloaded_at + cache[snapshot_id] = snap_dt.timestamp() if snap_dt else _coerce_sort_timestamp(snapshot.timestamp) + else: + cache[snapshot_id] = _coerce_sort_timestamp(date_str) + + return (cache[snapshot_id], match_path) + + +def _latest_response_match(domain: str, rel_path: str) -> tuple[Path, Path] | None: + if not domain or not rel_path: + return None + domain = domain.split(":", 1)[0].lower() + # TODO: optimize by querying output_files in DB instead of globbing filesystem + data_root = DATA_DIR / "users" + escaped_domain = escape(domain) + escaped_path = escape(rel_path) + pattern = str(data_root / "*" / "snapshots" / "*" / escaped_domain / "*" / "responses" / escaped_domain / escaped_path) + matches = glob(pattern) + if not matches: + return None + + sort_cache: dict[str, float] = {} + best = max(matches, key=lambda match_path: _snapshot_sort_key(match_path, sort_cache)) + best_path = Path(best) + parts = best_path.parts + try: + responses_idx = parts.index("responses") + except ValueError: + return None + responses_root = Path(*parts[: responses_idx + 1]) + rel_to_root = Path(*parts[responses_idx + 1 :]) + return responses_root, rel_to_root + + +def _latest_responses_root(domain: str) -> Path | None: + if not domain: + return None + domain = domain.split(":", 1)[0].lower() + data_root = DATA_DIR / "users" + escaped_domain = escape(domain) + pattern = str(data_root / "*" / "snapshots" / "*" / escaped_domain / "*" / "responses" / escaped_domain) + matches = glob(pattern) + if not matches: + return None + + sort_cache: dict[str, float] = {} + best = max(matches, key=lambda match_path: 
_snapshot_sort_key(match_path, sort_cache)) + return Path(best) + + +def _latest_snapshot_for_domain(domain: str) -> Snapshot | None: + if not domain: + return None + + requested_domain = domain.split(":", 1)[0].lower() + snapshots = SnapshotView.find_snapshots_for_url(f"https://{requested_domain}").order_by("-created_at", "-bookmarked_at", "-timestamp") + for snapshot in snapshots: + if Snapshot.extract_domain_from_url(snapshot.url).lower() == requested_domain: + return snapshot + return None + + +def _original_request_url(domain: str, path: str = "", query_string: str = "") -> str: + normalized_domain = (domain or "").split(":", 1)[0].lower() + normalized_path = (path or "").lstrip("/") + if normalized_path in ("", "index.html"): + normalized_path = "" + target = f"https://{normalized_domain}" + if normalized_path: + target = f"{target}/{normalized_path}" + if query_string: + target = f"{target}?{query_string}" + return target + + +def _serve_responses_path(request, responses_root: Path, rel_path: str, show_indexes: bool): + candidates: list[str] = [] + rel_path = rel_path or "" + if rel_path.endswith("/"): + rel_path = f"{rel_path}index.html" + if "." 
not in Path(rel_path).name: + candidates.append(f"{rel_path.rstrip('/')}/index.html") + candidates.append(rel_path) + + for candidate in candidates: + try: + return serve_static_with_byterange_support( + request, + candidate, + document_root=str(responses_root), + show_indexes=show_indexes, + is_archive_replay=True, + ) + except Http404: + pass + + if rel_path.endswith("index.html"): + rel_dir = rel_path[: -len("index.html")] + try: + return serve_static_with_byterange_support( + request, + rel_dir, + document_root=str(responses_root), + show_indexes=True, + is_archive_replay=True, + ) + except Http404: + return None + return None + + +def _serve_snapshot_replay(request: HttpRequest, snapshot: Snapshot, path: str = ""): + rel_path = path or "" + is_directory_request = bool(path) and path.endswith("/") + show_indexes = bool(request.GET.get("files")) or (SERVER_CONFIG.USES_SUBDOMAIN_ROUTING and is_directory_request) + if not show_indexes and (not rel_path or rel_path == "index.html"): + return SnapshotView.render_live_index(request, snapshot) + + if not rel_path or rel_path.endswith("/"): + if show_indexes: + rel_path = rel_path.rstrip("/") + else: + rel_path = f"{rel_path}index.html" + rel_path = _safe_archive_relpath(rel_path) + if rel_path is None: + raise Http404 + + try: + return serve_static_with_byterange_support( + request, + rel_path, + document_root=snapshot.output_dir, + show_indexes=show_indexes, + is_archive_replay=True, + ) + except Http404: + pass + + host = urlparse(snapshot.url).hostname or snapshot.domain + responses_root = Path(snapshot.output_dir) / "responses" / host + if responses_root.exists(): + response = _serve_responses_path(request, responses_root, rel_path, show_indexes) + if response is not None: + return response + + raise Http404 + + +def _serve_original_domain_replay(request: HttpRequest, domain: str, path: str = ""): + requested_root_index = path in ("", "index.html") or path.endswith("/") + rel_path = path or "" + if not rel_path or 
rel_path.endswith("/"): + rel_path = f"{rel_path}index.html" + rel_path = _safe_archive_relpath(rel_path) + if rel_path is None: + raise Http404 + + domain = domain.lower() + match = _latest_response_match(domain, rel_path) + if not match and "." not in Path(rel_path).name: + index_path = f"{rel_path.rstrip('/')}/index.html" + match = _latest_response_match(domain, index_path) + if not match and "." not in Path(rel_path).name: + html_path = f"{rel_path}.html" + match = _latest_response_match(domain, html_path) + + show_indexes = bool(request.GET.get("files")) + if match: + responses_root, rel_to_root = match + response = _serve_responses_path(request, responses_root, str(rel_to_root), show_indexes) + if response is not None: + return response + + responses_root = _latest_responses_root(domain) + if responses_root: + response = _serve_responses_path(request, responses_root, rel_path, show_indexes) + if response is not None: + return response + + if requested_root_index and not show_indexes: + snapshot = _latest_snapshot_for_domain(domain) + if snapshot: + return SnapshotView.render_live_index(request, snapshot) + + if SERVER_CONFIG.PUBLIC_ADD_VIEW or request.user.is_authenticated: + target_url = _original_request_url(domain, path, request.META.get("QUERY_STRING", "")) + return redirect(build_web_url(f"/web/{quote(target_url, safe=':/')}")) + + raise Http404 + + +class SnapshotHostView(View): + """Serve snapshot directory contents on ./.""" + + def get(self, request, snapshot_id: str, path: str = ""): + if not request.user.is_authenticated and not SERVER_CONFIG.PUBLIC_SNAPSHOTS: + return _admin_login_redirect_or_forbidden(request) + snapshot = _find_snapshot_by_ref(snapshot_id) + + if not snapshot: + raise Http404 + + canonical_host = get_snapshot_host(str(snapshot.id)) + if not host_matches(request.get_host(), canonical_host): + target = build_snapshot_url(str(snapshot.id), path, request=request) + if request.META.get("QUERY_STRING"): + target = 
f"{target}?{request.META['QUERY_STRING']}" + return redirect(target) + + return _serve_snapshot_replay(request, snapshot, path) + + +class SnapshotReplayView(View): + """Serve snapshot directory contents on a one-domain replay path.""" + + def get(self, request, snapshot_id: str, path: str = ""): + if not request.user.is_authenticated and not SERVER_CONFIG.PUBLIC_SNAPSHOTS: + return _admin_login_redirect_or_forbidden(request) + + snapshot = _find_snapshot_by_ref(snapshot_id) + if not snapshot: + raise Http404 + + return _serve_snapshot_replay(request, snapshot, path) + + +class OriginalDomainHostView(View): + """Serve responses from the most recent snapshot when using ./.""" + + def get(self, request, domain: str, path: str = ""): + if not request.user.is_authenticated and not SERVER_CONFIG.PUBLIC_SNAPSHOTS: + return _admin_login_redirect_or_forbidden(request) + return _serve_original_domain_replay(request, domain, path) + + +class OriginalDomainReplayView(View): + """Serve original-domain replay content on a one-domain replay path.""" + + def get(self, request, domain: str, path: str = ""): + if not request.user.is_authenticated and not SERVER_CONFIG.PUBLIC_SNAPSHOTS: + return _admin_login_redirect_or_forbidden(request) + return _serve_original_domain_replay(request, domain, path) + class PublicIndexView(ListView): - template_name = 'public_index.html' + template_name = "public_index.html" model = Snapshot - paginate_by = SNAPSHOTS_PER_PAGE - ordering = ['-added'] + paginate_by = SERVER_CONFIG.SNAPSHOTS_PER_PAGE + ordering = ["-bookmarked_at", "-created_at"] def get_context_data(self, **kwargs): return { **super().get_context_data(**kwargs), - 'VERSION': VERSION, - 'FOOTER_INFO': FOOTER_INFO, + "VERSION": VERSION, + "COMMIT_HASH": SHELL_CONFIG.COMMIT_HASH, + "FOOTER_INFO": SERVER_CONFIG.FOOTER_INFO, + "search_mode": get_search_mode(self.request.GET.get("search_mode")), } - def get_queryset(self, **kwargs): + def get_queryset(self, **kwargs): qs = 
super().get_queryset(**kwargs) - query = self.request.GET.get('q') - if query and query.strip(): - qs = qs.filter(Q(title__icontains=query) | Q(url__icontains=query) | Q(timestamp__icontains=query) | Q(tags__name__icontains=query)) - try: - qs = qs | query_search_index(query) - except Exception as err: - print(f'[!] Error while using search backend: {err.__class__.__name__} {err}') - return qs + query = self.request.GET.get("q", default="").strip() + + if not query: + return qs.distinct() + + query_type = self.request.GET.get("query_type") + search_mode = get_search_mode(self.request.GET.get("search_mode")) + + if not query_type or query_type == "all": + metadata_qs = qs.filter( + Q(title__icontains=query) | Q(url__icontains=query) | Q(timestamp__icontains=query) | Q(tags__name__icontains=query), + ) + if search_mode == "meta": + qs = metadata_qs + else: + try: + qs = prioritize_metadata_matches( + qs, + metadata_qs, + query_search_index(query, search_mode=search_mode), + ordering=self.ordering, + ) + except Exception as err: + print(f"[!] Error while using search backend: {err.__class__.__name__} {err}") + qs = metadata_qs + elif query_type == "fulltext": + if search_mode == "meta": + qs = qs.none() + else: + try: + qs = query_search_index(query, search_mode=search_mode).filter(pk__in=qs.values("pk")) + except Exception as err: + print(f"[!] Error while using search backend: {err.__class__.__name__} {err}") + qs = qs.none() + elif query_type == "meta": + qs = qs.filter( + Q(title__icontains=query) | Q(url__icontains=query) | Q(timestamp__icontains=query) | Q(tags__name__icontains=query), + ) + elif query_type == "url": + qs = qs.filter(Q(url__icontains=query)) + elif query_type == "title": + qs = qs.filter(Q(title__icontains=query)) + elif query_type == "timestamp": + qs = qs.filter(Q(timestamp__icontains=query)) + elif query_type == "tags": + qs = qs.filter(Q(tags__name__icontains=query)) + else: + print(f'[!] 
Unknown value for query_type: "{query_type}"') + + return qs.distinct() def get(self, *args, **kwargs): - if PUBLIC_INDEX or self.request.user.is_authenticated: + if self.request.user.is_authenticated: + return redirect("/admin/core/snapshot/") + if SERVER_CONFIG.PUBLIC_INDEX: response = super().get(*args, **kwargs) return response else: - return redirect(f'/admin/login/?next={self.request.path}') + return _admin_login_redirect_or_forbidden(self.request) +@method_decorator(csrf_exempt, name="dispatch") class AddView(UserPassesTestMixin, FormView): template_name = "add.html" form_class = AddLinkForm def get_initial(self): """Prefill the AddLinkForm with the 'url' GET parameter""" - if self.request.method == 'GET': - url = self.request.GET.get('url', None) + if self.request.method == "GET": + url = self.request.GET.get("url", None) if url: - return {'url': url if '://' in url else f'https://{url}'} - + return {"url": url if "://" in url else f"https://{url}"} + return super().get_initial() def test_func(self): - return PUBLIC_ADD_VIEW or self.request.user.is_authenticated + return SERVER_CONFIG.PUBLIC_ADD_VIEW or self.request.user.is_authenticated + + def _can_override_crawl_config(self) -> bool: + user = self.request.user + return bool(user.is_authenticated and (getattr(user, "is_superuser", False) or getattr(user, "is_staff", False))) + + def _get_custom_config_overrides(self, form: AddLinkForm) -> dict: + custom_config = form.cleaned_data.get("config") or {} + + if not isinstance(custom_config, dict): + return {} + + if not self._can_override_crawl_config(): + return {} + + return custom_config def get_context_data(self, **kwargs): + required_search_plugin = f"search_backend_{SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}".strip() + plugin_configs = discover_plugin_configs() + plugin_dependency_map = { + plugin_name: [ + str(required_plugin).strip() for required_plugin in (schema.get("required_plugins") or []) if str(required_plugin).strip() + ] + for plugin_name, 
schema in plugin_configs.items() + if isinstance(schema.get("required_plugins"), list) and schema.get("required_plugins") + } return { **super().get_context_data(**kwargs), - 'title': "Add URLs", + "title": "Create Crawl", # We can't just call request.build_absolute_uri in the template, because it would include query parameters - 'absolute_add_path': self.request.build_absolute_uri(self.request.path), - 'VERSION': VERSION, - 'FOOTER_INFO': FOOTER_INFO, - 'stdout': '', + "absolute_add_path": self.request.build_absolute_uri(self.request.path), + "VERSION": VERSION, + "FOOTER_INFO": SERVER_CONFIG.FOOTER_INFO, + "required_search_plugin": required_search_plugin, + "plugin_dependency_map_json": json.dumps(plugin_dependency_map, sort_keys=True), + "stdout": "", } - def form_valid(self, form): - url = form.cleaned_data["url"] - print(f'[+] Adding URL: {url}') - parser = form.cleaned_data["parser"] + def _create_crawl_from_form(self, form, *, created_by_id=None) -> Crawl: + urls = form.cleaned_data["url"] + print(f"[+] Adding URL: {urls}") + + # Extract all form fields tag = form.cleaned_data["tag"] - depth = 0 if form.cleaned_data["depth"] == "0" else 1 - extractors = ','.join(form.cleaned_data["archive_methods"]) - input_kwargs = { - "urls": url, - "tag": tag, - "depth": depth, - "parser": parser, - "update_all": False, - "out_dir": OUTPUT_DIR, + depth = int(form.cleaned_data["depth"]) + max_urls = int(form.cleaned_data.get("max_urls") or 0) + max_size = int(form.cleaned_data.get("max_size") or 0) + plugins = ",".join(form.cleaned_data.get("plugins", [])) + schedule = form.cleaned_data.get("schedule", "").strip() + persona = form.cleaned_data.get("persona") + index_only = form.cleaned_data.get("index_only", False) + notes = form.cleaned_data.get("notes", "") + url_filters = form.cleaned_data.get("url_filters") or {} + custom_config = self._get_custom_config_overrides(form) + + from archivebox.config.permissions import HOSTNAME + + if created_by_id is None: + if 
self.request.user.is_authenticated: + created_by_id = self.request.user.pk + else: + from archivebox.base_models.models import get_or_create_system_user_pk + + created_by_id = get_or_create_system_user_pk() + + created_by_name = getattr(self.request.user, "username", "web") if self.request.user.is_authenticated else "web" + + # 1. save the provided urls to sources/2024-11-05__23-59-59__web_ui_add_by_user_.txt + sources_file = CONSTANTS.SOURCES_DIR / f"{timezone.now().strftime('%Y-%m-%d__%H-%M-%S')}__web_ui_add_by_user_{created_by_id}.txt" + sources_file.parent.mkdir(parents=True, exist_ok=True) + sources_file.write_text(urls if isinstance(urls, str) else "\n".join(urls)) + + # 2. create a new Crawl with the URLs from the file + timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S") + urls_content = sources_file.read_text() + # Build complete config + config = { + "INDEX_ONLY": index_only, + "DEPTH": depth, + "PLUGINS": plugins or "", + "DEFAULT_PERSONA": (persona.name if persona else "Default"), } - if extractors: - input_kwargs.update({"extractors": extractors}) - add_stdout = StringIO() - with redirect_stdout(add_stdout): - add(**input_kwargs) - print(add_stdout.getvalue()) - - context = self.get_context_data() - - context.update({ - "stdout": ansi_to_html(add_stdout.getvalue().strip()), - "form": AddLinkForm() - }) - return render(template_name=self.template_name, request=self.request, context=context) + + # Merge custom config overrides + config.update(custom_config) + if url_filters.get("allowlist"): + config["URL_ALLOWLIST"] = url_filters["allowlist"] + if url_filters.get("denylist"): + config["URL_DENYLIST"] = url_filters["denylist"] + + crawl = Crawl.objects.create( + urls=urls_content, + max_depth=depth, + max_urls=max_urls, + max_size=max_size, + tags_str=tag, + notes=notes, + label=f"{created_by_name}@{HOSTNAME}{self.request.path} {timestamp}", + created_by_id=created_by_id, + config=config, + ) + + # 3. 
create a CrawlSchedule if schedule is provided + if schedule: + from archivebox.crawls.models import CrawlSchedule + + crawl_schedule = CrawlSchedule.objects.create( + template=crawl, + schedule=schedule, + is_enabled=True, + label=crawl.label, + notes=f"Auto-created from add page. {notes}".strip(), + created_by_id=created_by_id, + ) + crawl.schedule = crawl_schedule + crawl.save(update_fields=["schedule"]) + + crawl.create_snapshots_from_urls() + from archivebox.services.runner import ensure_background_runner + + ensure_background_runner() + + # 4. start the Orchestrator & wait until it completes + # ... orchestrator will create the root Snapshot, which creates pending ArchiveResults, which gets run by the ArchiveResultActors ... + # from archivebox.crawls.actors import CrawlActor + # from archivebox.core.actors import SnapshotActor, ArchiveResultActor + + return crawl + + def form_valid(self, form): + crawl = self._create_crawl_from_form(form) + + urls = form.cleaned_data["url"] + schedule = form.cleaned_data.get("schedule", "").strip() + rough_url_count = len([url for url in urls.splitlines() if url.strip()]) + + # Build success message with schedule link if created + schedule_msg = "" + if schedule: + schedule_msg = f" and scheduled to repeat {schedule}" + + messages.success( + self.request, + mark_safe( + f"Created crawl with {rough_url_count} starting URL(s){schedule_msg}. Snapshots will be created and archived in the background. 
View Crawl →", + ), + ) + + # Orchestrator (managed by supervisord) will pick up the queued crawl + return redirect(crawl.admin_change_url) + + +class WebAddView(AddView): + def _latest_snapshot_for_url(self, requested_url: str): + return SnapshotView.find_snapshots_for_url(requested_url).order_by("-created_at", "-bookmarked_at", "-timestamp").first() + + def _normalize_add_url(self, requested_url: str) -> str: + if requested_url.startswith(("http://", "https://")): + return requested_url + return f"https://{requested_url}" + + def dispatch(self, request, *args, **kwargs): + requested_url = urldecode(kwargs.get("url", "") or "") + if requested_url: + snapshot = self._latest_snapshot_for_url(requested_url) + if snapshot: + return redirect(f"/{snapshot.url_path}") + + if not self.test_func(): + request_host = (request.get_host() or "").lower() + if host_matches(request_host, get_web_host()): + return redirect(build_admin_url(request.get_full_path(), request=request)) + if host_matches(request_host, get_admin_host()): + next_url = quote(request.get_full_path(), safe="/:?=&") + return redirect(f"{build_admin_url('/admin/login/', request=request)}?next={next_url}") + return HttpResponse( + format_html( + ( + "



" + "No Snapshots match the given url: {}


" + 'Return to the Main Index' + "
" + ), + requested_url or "", + ), + content_type="text/html", + status=404, + ) + + return super().dispatch(request, *args, **kwargs) + + def get(self, request: HttpRequest, *args: object, **kwargs: object): + requested_url = urldecode(str(kwargs.get("url") or (args[0] if args else ""))) + if not requested_url: + raise Http404 + + snapshot = self._latest_snapshot_for_url(requested_url) + if snapshot: + return redirect(f"/{snapshot.url_path}") + + add_url = self._normalize_add_url(requested_url) + assert self.form_class is not None + defaults_form = self.form_class() + form_data = { + "url": add_url, + "depth": defaults_form.fields["depth"].initial or "0", + "max_urls": defaults_form.fields["max_urls"].initial or 0, + "max_size": defaults_form.fields["max_size"].initial or "0", + "persona": defaults_form.fields["persona"].initial or "Default", + "config": {}, + } + if defaults_form.fields["index_only"].initial: + form_data["index_only"] = "on" + + form = self.form_class(data=form_data) + if not form.is_valid(): + return self.form_invalid(form) + + crawl = self._create_crawl_from_form(form) + snapshot = Snapshot.from_json({"url": add_url, "tags": form.cleaned_data.get("tag", "")}, overrides={"crawl": crawl}) + assert snapshot is not None + return redirect(f"/{snapshot.url_path}") + + +class HealthCheckView(View): + """ + A Django view that renders plain text "OK" for service discovery tools + """ + + def get(self, request): + """ + Handle a GET request + """ + return HttpResponse("OK", content_type="text/plain", status=200) + + +def live_progress_view(request): + """Simple JSON endpoint for live progress status - used by admin progress monitor.""" + try: + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot, ArchiveResult + from archivebox.machine.models import Process, Machine + + def is_current_run_timestamp(event_ts, run_started_at) -> bool: + if run_started_at is None: + return True + if event_ts is None: + return False + 
return event_ts >= run_started_at + + def archiveresult_matches_current_run(ar, run_started_at) -> bool: + if run_started_at is None: + return True + if ar.status in ( + ArchiveResult.StatusChoices.QUEUED, + ArchiveResult.StatusChoices.STARTED, + ArchiveResult.StatusChoices.BACKOFF, + ): + return True + event_ts = ar.end_ts or ar.start_ts or ar.modified_at or ar.created_at + return is_current_run_timestamp(event_ts, run_started_at) + + def hook_details(hook_name: str, plugin: str = "setup") -> tuple[str, str, str, str]: + normalized_hook_name = Path(hook_name).name if hook_name else "" + if not normalized_hook_name: + return (plugin, plugin, "unknown", "") + + phase = "unknown" + if normalized_hook_name == "InstallEvent": + phase = "install" + elif normalized_hook_name.startswith("on_CrawlSetup__"): + phase = "crawl" + elif normalized_hook_name.startswith("on_Snapshot__"): + phase = "snapshot" + elif normalized_hook_name.startswith("on_BinaryRequest__"): + phase = "binary" + + label = normalized_hook_name + if "__" in normalized_hook_name: + label = normalized_hook_name.split("__", 1)[1] + label = label.rsplit(".", 1)[0] + if len(label) > 3 and label[:2].isdigit() and label[2] == "_": + label = label[3:] + label = label.replace("_", " ").strip() or plugin + + return (plugin, label, phase, normalized_hook_name) + + def process_label(cmd: list[str] | None) -> tuple[str, str, str, str]: + hook_path = "" + if isinstance(cmd, list) and cmd: + first = cmd[0] + if isinstance(first, str): + hook_path = first + + if not hook_path: + return ("", "setup", "unknown", "") + + return hook_details(Path(hook_path).name, plugin=Path(hook_path).parent.name or "setup") + + machine = Machine.current() + Process.cleanup_stale_running(machine=machine) + Process.cleanup_orphaned_workers() + orchestrator_proc = ( + Process.objects.filter( + machine=machine, + process_type=Process.TypeChoices.ORCHESTRATOR, + status=Process.StatusChoices.RUNNING, + ) + .order_by("-started_at") + .first() + 
) + orchestrator_running = orchestrator_proc is not None + orchestrator_pid = orchestrator_proc.pid if orchestrator_proc else None + # Get model counts by status + crawls_pending = Crawl.objects.filter(status=Crawl.StatusChoices.QUEUED).count() + crawls_started = Crawl.objects.filter(status=Crawl.StatusChoices.STARTED).count() + + # Get recent crawls (last 24 hours) + from datetime import timedelta + + one_day_ago = timezone.now() - timedelta(days=1) + crawls_recent = Crawl.objects.filter(created_at__gte=one_day_ago).count() + + snapshots_pending = Snapshot.objects.filter(status=Snapshot.StatusChoices.QUEUED).count() + snapshots_started = Snapshot.objects.filter(status=Snapshot.StatusChoices.STARTED).count() + + archiveresults_pending = ArchiveResult.objects.filter(status=ArchiveResult.StatusChoices.QUEUED).count() + archiveresults_started = ArchiveResult.objects.filter(status=ArchiveResult.StatusChoices.STARTED).count() + archiveresults_succeeded = ArchiveResult.objects.filter(status=ArchiveResult.StatusChoices.SUCCEEDED).count() + archiveresults_failed = ArchiveResult.objects.filter(status=ArchiveResult.StatusChoices.FAILED).count() + + # Get recently completed ArchiveResults with thumbnails (last 20 succeeded results) + recent_thumbnails = [] + recent_results = ( + ArchiveResult.objects.filter( + status=ArchiveResult.StatusChoices.SUCCEEDED, + ) + .select_related("snapshot") + .order_by("-end_ts")[:20] + ) + + for ar in recent_results: + embed = ar.embed_path() + if embed: + # Only include results with embeddable image/media files + ext = embed.lower().split(".")[-1] if "." 
in embed else "" + is_embeddable = ext in ("png", "jpg", "jpeg", "gif", "webp", "svg", "ico", "pdf", "html") + if is_embeddable or ar.plugin in ("screenshot", "favicon", "dom"): + archive_path = embed or "" + recent_thumbnails.append( + { + "id": str(ar.id), + "plugin": ar.plugin, + "snapshot_id": str(ar.snapshot_id), + "snapshot_url": ar.snapshot.url[:60] if ar.snapshot else "", + "embed_path": embed, + "archive_path": archive_path, + "archive_url": build_snapshot_url(str(ar.snapshot_id), archive_path, request=request) if archive_path else "", + "end_ts": ar.end_ts.isoformat() if ar.end_ts else None, + }, + ) + + # Build hierarchical active crawls with nested snapshots and archive results + + active_crawls_qs = ( + Crawl.objects.filter(status__in=[Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED]) + .prefetch_related( + "snapshot_set", + "snapshot_set__archiveresult_set", + "snapshot_set__archiveresult_set__process", + ) + .distinct() + .order_by("-modified_at")[:10] + ) + + running_processes = Process.objects.filter( + machine=machine, + status=Process.StatusChoices.RUNNING, + process_type__in=[ + Process.TypeChoices.HOOK, + Process.TypeChoices.BINARY, + ], + ) + recent_processes = Process.objects.filter( + machine=machine, + process_type__in=[ + Process.TypeChoices.HOOK, + Process.TypeChoices.BINARY, + ], + modified_at__gte=timezone.now() - timedelta(minutes=10), + ).order_by("-modified_at") + crawl_process_pids: dict[str, int] = {} + snapshot_process_pids: dict[str, int] = {} + process_records_by_crawl: dict[str, list[tuple[dict[str, object], object | None]]] = {} + process_records_by_snapshot: dict[str, list[tuple[dict[str, object], object | None]]] = {} + seen_process_records: set[str] = set() + snapshots = [snapshot for crawl in active_crawls_qs for snapshot in crawl.snapshot_set.all()] + for proc in running_processes: + if not proc.pwd: + continue + proc_pwd = Path(proc.pwd) + matched_snapshot = None + for snapshot in snapshots: + try: + 
proc_pwd.relative_to(snapshot.output_dir) + matched_snapshot = snapshot + break + except ValueError: + continue + if matched_snapshot is None: + continue + crawl_id = str(matched_snapshot.crawl_id) + snapshot_id = str(matched_snapshot.id) + _plugin, _label, phase, _hook_name = process_label(proc.cmd) + if crawl_id and proc.pid: + crawl_process_pids.setdefault(crawl_id, proc.pid) + if phase == "snapshot" and snapshot_id and proc.pid: + snapshot_process_pids.setdefault(snapshot_id, proc.pid) + + for proc in recent_processes: + if not proc.pwd: + continue + proc_pwd = Path(proc.pwd) + matched_snapshot = None + for snapshot in snapshots: + try: + proc_pwd.relative_to(snapshot.output_dir) + matched_snapshot = snapshot + break + except ValueError: + continue + if matched_snapshot is None: + continue + crawl_id = str(matched_snapshot.crawl_id) + snapshot_id = str(matched_snapshot.id) + + plugin, label, phase, hook_name = process_label(proc.cmd) + + record_scope = str(snapshot_id) if phase == "snapshot" and snapshot_id else str(crawl_id) + proc_key = f"{record_scope}:{plugin}:{label}:{proc.status}:{proc.exit_code}" + if proc_key in seen_process_records: + continue + seen_process_records.add(proc_key) + + status = ( + "started" + if proc.status == Process.StatusChoices.RUNNING + else ("failed" if proc.exit_code not in (None, 0) else "succeeded") + ) + payload: dict[str, object] = { + "id": str(proc.id), + "plugin": plugin, + "label": label, + "hook_name": hook_name, + "status": status, + "phase": phase, + "source": "process", + "process_id": str(proc.id), + } + if status == "started" and proc.pid: + payload["pid"] = proc.pid + proc_started_at = proc.started_at or proc.modified_at + if phase == "snapshot" and snapshot_id: + process_records_by_snapshot.setdefault(snapshot_id, []).append((payload, proc_started_at)) + elif crawl_id: + process_records_by_crawl.setdefault(crawl_id, []).append((payload, proc_started_at)) + + active_crawls = [] + total_workers = 0 + for crawl in 
active_crawls_qs: + # Get ALL snapshots for this crawl to count status (already prefetched) + all_crawl_snapshots = list(crawl.snapshot_set.all()) + + # Count snapshots by status from ALL snapshots + total_snapshots = len(all_crawl_snapshots) + completed_snapshots = sum(1 for s in all_crawl_snapshots if s.status == Snapshot.StatusChoices.SEALED) + started_snapshots = sum(1 for s in all_crawl_snapshots if s.status == Snapshot.StatusChoices.STARTED) + pending_snapshots = sum(1 for s in all_crawl_snapshots if s.status == Snapshot.StatusChoices.QUEUED) + + # Get only ACTIVE snapshots to display (limit to 5 most recent) + active_crawl_snapshots = [ + s for s in all_crawl_snapshots if s.status in [Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED] + ][:5] + + # Count URLs in the crawl (for when snapshots haven't been created yet) + urls_count = 0 + if crawl.urls: + urls_count = len([u for u in crawl.urls.split("\n") if u.strip() and not u.startswith("#")]) + + # Calculate crawl progress + crawl_progress = int((completed_snapshots / total_snapshots) * 100) if total_snapshots > 0 else 0 + crawl_run_started_at = crawl.created_at + crawl_setup_plugins = [ + payload + for payload, proc_started_at in process_records_by_crawl.get(str(crawl.id), []) + if is_current_run_timestamp(proc_started_at, crawl_run_started_at) + ] + total_workers += sum(1 for item in crawl_setup_plugins if item.get("source") == "process" and item.get("status") == "started") + crawl_setup_total = len(crawl_setup_plugins) + crawl_setup_completed = sum(1 for item in crawl_setup_plugins if item.get("status") == "succeeded") + crawl_setup_failed = sum(1 for item in crawl_setup_plugins if item.get("status") == "failed") + crawl_setup_pending = sum(1 for item in crawl_setup_plugins if item.get("status") == "queued") + + # Get active snapshots for this crawl (already prefetched) + active_snapshots_for_crawl = [] + for snapshot in active_crawl_snapshots: + snapshot_run_started_at = 
snapshot.downloaded_at or snapshot.created_at + # Get archive results for this snapshot (already prefetched) + snapshot_results = [ + ar for ar in snapshot.archiveresult_set.all() if archiveresult_matches_current_run(ar, snapshot_run_started_at) + ] + + now = timezone.now() + plugin_progress_values: list[int] = [] + all_plugins: list[dict[str, object]] = [] + seen_plugin_keys: set[str] = set() + + def plugin_sort_key(ar): + status_order = { + ArchiveResult.StatusChoices.STARTED: 0, + ArchiveResult.StatusChoices.QUEUED: 1, + ArchiveResult.StatusChoices.SUCCEEDED: 2, + ArchiveResult.StatusChoices.NORESULTS: 3, + ArchiveResult.StatusChoices.FAILED: 4, + } + return (status_order.get(ar.status, 5), ar.plugin, ar.hook_name or "") + + for ar in sorted(snapshot_results, key=plugin_sort_key): + status = ar.status + progress_value = 0 + if status in ( + ArchiveResult.StatusChoices.SUCCEEDED, + ArchiveResult.StatusChoices.FAILED, + ArchiveResult.StatusChoices.SKIPPED, + ArchiveResult.StatusChoices.NORESULTS, + ): + progress_value = 100 + elif status == ArchiveResult.StatusChoices.STARTED: + started_at = ar.start_ts or (ar.process.started_at if ar.process_id and ar.process else None) + timeout = ar.timeout or 120 + if started_at and timeout: + elapsed = max(0.0, (now - started_at).total_seconds()) + progress_value = int(min(99, max(1, (elapsed / float(timeout)) * 100))) + else: + progress_value = 1 + else: + progress_value = 0 + + plugin_progress_values.append(progress_value) + plugin, label, phase, hook_name = hook_details(ar.hook_name or ar.plugin, plugin=ar.plugin) + + plugin_payload = { + "id": str(ar.id), + "plugin": ar.plugin, + "label": label, + "hook_name": hook_name, + "phase": phase, + "status": status, + "process_id": str(ar.process_id) if ar.process_id else None, + } + if status == ArchiveResult.StatusChoices.STARTED and ar.process_id and ar.process: + plugin_payload["pid"] = ar.process.pid + if status == ArchiveResult.StatusChoices.STARTED: + 
plugin_payload["progress"] = progress_value + plugin_payload["timeout"] = ar.timeout or 120 + plugin_payload["source"] = "archiveresult" + all_plugins.append(plugin_payload) + seen_plugin_keys.add(str(ar.process_id) if ar.process_id else f"{ar.plugin}:{hook_name}") + + for proc_payload, proc_started_at in process_records_by_snapshot.get(str(snapshot.id), []): + if not is_current_run_timestamp(proc_started_at, snapshot_run_started_at): + continue + proc_key = str(proc_payload.get("process_id") or f"{proc_payload.get('plugin')}:{proc_payload.get('hook_name')}") + if proc_key in seen_plugin_keys: + continue + seen_plugin_keys.add(proc_key) + all_plugins.append(proc_payload) + + proc_status = proc_payload.get("status") + if proc_status in ("succeeded", "failed", "skipped"): + plugin_progress_values.append(100) + elif proc_status == "started": + plugin_progress_values.append(1) + total_workers += 1 + else: + plugin_progress_values.append(0) + + total_plugins = len(all_plugins) + completed_plugins = sum(1 for item in all_plugins if item.get("status") == "succeeded") + failed_plugins = sum(1 for item in all_plugins if item.get("status") == "failed") + pending_plugins = sum(1 for item in all_plugins if item.get("status") == "queued") + + snapshot_progress = int(sum(plugin_progress_values) / len(plugin_progress_values)) if plugin_progress_values else 0 + + active_snapshots_for_crawl.append( + { + "id": str(snapshot.id), + "url": snapshot.url[:80], + "status": snapshot.status, + "started": (snapshot.downloaded_at or snapshot.created_at).isoformat() + if (snapshot.downloaded_at or snapshot.created_at) + else None, + "progress": snapshot_progress, + "total_plugins": total_plugins, + "completed_plugins": completed_plugins, + "failed_plugins": failed_plugins, + "pending_plugins": pending_plugins, + "all_plugins": all_plugins, + "worker_pid": snapshot_process_pids.get(str(snapshot.id)), + }, + ) + + # Check if crawl can start (for debugging stuck crawls) + can_start = 
bool(crawl.urls) + urls_preview = crawl.urls[:60] if crawl.urls else None + + # Check if retry_at is in the future (would prevent worker from claiming) + retry_at_future = crawl.retry_at > timezone.now() if crawl.retry_at else False + seconds_until_retry = int((crawl.retry_at - timezone.now()).total_seconds()) if crawl.retry_at and retry_at_future else 0 + + active_crawls.append( + { + "id": str(crawl.id), + "label": str(crawl)[:60], + "status": crawl.status, + "started": crawl.created_at.isoformat() if crawl.created_at else None, + "progress": crawl_progress, + "max_depth": crawl.max_depth, + "urls_count": urls_count, + "total_snapshots": total_snapshots, + "completed_snapshots": completed_snapshots, + "started_snapshots": started_snapshots, + "failed_snapshots": 0, + "pending_snapshots": pending_snapshots, + "setup_plugins": crawl_setup_plugins, + "setup_total_plugins": crawl_setup_total, + "setup_completed_plugins": crawl_setup_completed, + "setup_failed_plugins": crawl_setup_failed, + "setup_pending_plugins": crawl_setup_pending, + "active_snapshots": active_snapshots_for_crawl, + "can_start": can_start, + "urls_preview": urls_preview, + "retry_at_future": retry_at_future, + "seconds_until_retry": seconds_until_retry, + "worker_pid": crawl_process_pids.get(str(crawl.id)), + }, + ) + + return JsonResponse( + { + "orchestrator_running": orchestrator_running, + "orchestrator_pid": orchestrator_pid, + "total_workers": total_workers, + "crawls_pending": crawls_pending, + "crawls_started": crawls_started, + "crawls_recent": crawls_recent, + "snapshots_pending": snapshots_pending, + "snapshots_started": snapshots_started, + "archiveresults_pending": archiveresults_pending, + "archiveresults_started": archiveresults_started, + "archiveresults_succeeded": archiveresults_succeeded, + "archiveresults_failed": archiveresults_failed, + "active_crawls": active_crawls, + "recent_thumbnails": recent_thumbnails, + "server_time": timezone.now().isoformat(), + }, + ) + except 
Exception as e: + import traceback + + return JsonResponse( + { + "error": str(e), + "traceback": traceback.format_exc(), + "orchestrator_running": False, + "total_workers": 0, + "crawls_pending": 0, + "crawls_started": 0, + "crawls_recent": 0, + "snapshots_pending": 0, + "snapshots_started": 0, + "archiveresults_pending": 0, + "archiveresults_started": 0, + "archiveresults_succeeded": 0, + "archiveresults_failed": 0, + "active_crawls": [], + "recent_thumbnails": [], + "server_time": timezone.now().isoformat(), + }, + status=500, + ) + + +def find_config_section(key: str) -> str: + CONFIGS = get_all_configs() + + if key in CONSTANTS_CONFIG: + return "CONSTANT" + matching_sections = [section_id for section_id, section in CONFIGS.items() if key in dict(section)] + section = matching_sections[0] if matching_sections else "DYNAMIC" + return section + + +def find_config_default(key: str) -> str: + CONFIGS = get_all_configs() + + if key in CONSTANTS_CONFIG: + return str(CONSTANTS_CONFIG[key]) + + default_val = None + + for config in CONFIGS.values(): + if key in dict(config): + default_field = getattr(config, "model_fields", dict(config))[key] + default_val = default_field.default if hasattr(default_field, "default") else default_field + break + + if isinstance(default_val, Callable): + default_val = inspect.getsource(default_val).split("lambda", 1)[-1].split(":", 1)[-1].replace("\n", " ").strip() + if default_val.count(")") > default_val.count("("): + default_val = default_val[:-1] + else: + default_val = str(default_val) + + return default_val + + +def find_config_type(key: str) -> str: + from typing import ClassVar + + CONFIGS = get_all_configs() + + for config in CONFIGS.values(): + if hasattr(config, key): + # Try to get from pydantic model_fields first (more reliable) + if hasattr(config, "model_fields") and key in config.model_fields: + field = config.model_fields[key] + if hasattr(field, "annotation") and field.annotation is not None: + try: + return 
str(field.annotation.__name__) + except AttributeError: + return str(field.annotation) + + # Fallback to get_type_hints with proper namespace + try: + import typing + + namespace = { + "ClassVar": ClassVar, + "Optional": typing.Optional, + "Union": typing.Union, + "List": list, + "Dict": dict, + "Path": Path, + } + type_hints = get_type_hints(config, globalns=namespace, localns=namespace) + try: + return str(type_hints[key].__name__) + except AttributeError: + return str(type_hints[key]) + except Exception: + # If all else fails, return str + pass + return "str" + + +def key_is_safe(key: str) -> bool: + for term in ("key", "password", "secret", "token"): + if term in key.lower(): + return False + return True + + +def find_config_source(key: str, merged_config: dict) -> str: + """Determine where a config value comes from.""" + from archivebox.machine.models import Machine + + # Environment variables override all persistent config sources. + if key in os.environ: + return "Environment" + + # Machine.config overrides ArchiveBox.conf. 
+ try: + machine = Machine.current() + if machine.config and key in machine.config: + return "Machine" + except Exception: + pass + + # Check if it's from archivebox.config.file + from archivebox.config.configset import BaseConfigSet + + file_config = BaseConfigSet.load_from_file(CONSTANTS.CONFIG_FILE) + if key in file_config: + return "Config File" + + # Otherwise it's using the default + return "Default" + + +def find_plugin_for_config_key(key: str) -> str | None: + for plugin_name, schema in discover_plugin_configs().items(): + if key in (schema.get("properties") or {}): + return plugin_name + return None + + +def get_config_definition_link(key: str) -> tuple[str, str]: + plugin_name = find_plugin_for_config_key(key) + if not plugin_name: + return ( + f"https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig+{quote(key)}&type=code", + "archivebox/config", + ) + + plugin_dir = next((path.resolve() for path in iter_plugin_dirs() if path.name == plugin_name), None) + if plugin_dir: + builtin_root = BUILTIN_PLUGINS_DIR.resolve() + if plugin_dir.is_relative_to(builtin_root): + return ( + f"{ABX_PLUGINS_GITHUB_BASE_URL}{quote(plugin_name)}/config.json", + f"abx_plugins/plugins/{plugin_name}/config.json", + ) + + user_root = USER_PLUGINS_DIR.resolve() + if plugin_dir.is_relative_to(user_root): + return ( + f"{LIVE_PLUGIN_BASE_URL}user.{quote(plugin_name)}/", + f"data/custom_plugins/{plugin_name}/config.json", + ) + + return ( + f"{LIVE_PLUGIN_BASE_URL}builtin.{quote(plugin_name)}/", + f"abx_plugins/plugins/{plugin_name}/config.json", + ) + + +@render_with_table_view +def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext: + CONFIGS = get_all_configs() + + assert getattr(request.user, "is_superuser", False), "Must be a superuser to view configuration settings." 
+ + # Get merged config that includes Machine.config overrides + try: + from archivebox.machine.models import Machine + + Machine.current() + merged_config = get_config() + except Exception: + # Fallback if Machine model not available + merged_config = get_config() + + rows = { + "Section": [], + "Key": [], + "Type": [], + "Value": [], + "Source": [], + "Default": [], + # "Documentation": [], + # "Aliases": [], + } + + for section_id, section in reversed(list(CONFIGS.items())): + for key in dict(section).keys(): + rows["Section"].append(section_id) # section.replace('_', ' ').title().replace(' Config', '') + rows["Key"].append(ItemLink(key, key=key)) + rows["Type"].append(format_html("{}", find_config_type(key))) + + # Use merged config value (includes machine overrides) + actual_value = merged_config.get(key, getattr(section, key, None)) + rows["Value"].append(mark_safe(f"{actual_value}") if key_is_safe(key) else "******** (redacted)") + + # Show where the value comes from + source = find_config_source(key, merged_config) + source_colors = {"Machine": "purple", "Environment": "blue", "Config File": "green", "Default": "gray"} + rows["Source"].append(format_html('{}', source_colors.get(source, "gray"), source)) + + rows["Default"].append( + mark_safe( + f'{find_config_default(key) or "See here..."}', + ), + ) + # rows['Documentation'].append(mark_safe(f'Wiki: {key}')) + # rows['Aliases'].append(', '.join(find_config_aliases(key))) + + section = "CONSTANT" + for key in CONSTANTS_CONFIG.keys(): + rows["Section"].append(section) # section.replace('_', ' ').title().replace(' Config', '') + rows["Key"].append(ItemLink(key, key=key)) + rows["Type"].append(format_html("{}", getattr(type(CONSTANTS_CONFIG[key]), "__name__", str(CONSTANTS_CONFIG[key])))) + rows["Value"].append(format_html("{}", CONSTANTS_CONFIG[key]) if key_is_safe(key) else "******** (redacted)") + rows["Source"].append(mark_safe('Constant')) + rows["Default"].append( + mark_safe( + 
f'{find_config_default(key) or "See here..."}', + ), + ) + # rows['Documentation'].append(mark_safe(f'Wiki: {key}')) + # rows['Aliases'].append('') + + return TableContext( + title="Computed Configuration Values", + table=rows, + ) + + +@render_with_item_view +def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: + from archivebox.machine.models import Machine + from archivebox.config.configset import BaseConfigSet + + CONFIGS = get_all_configs() + FLAT_CONFIG = get_flat_config() + + assert getattr(request.user, "is_superuser", False), "Must be a superuser to view configuration settings." + + # Get merged config + merged_config = get_config() + + # Determine all sources for this config value + sources_info = [] + + # Environment variable + if key in os.environ: + sources_info.append(("Environment", os.environ[key] if key_is_safe(key) else "********", "blue")) + + # Machine config + machine = None + machine_admin_url = None + try: + machine = Machine.current() + machine_admin_url = f"/admin/machine/machine/{machine.id}/change/" + if machine.config and key in machine.config: + sources_info.append(("Machine", machine.config[key] if key_is_safe(key) else "********", "purple")) + except Exception: + pass + + # Config file value + if CONSTANTS.CONFIG_FILE.exists(): + file_config = BaseConfigSet.load_from_file(CONSTANTS.CONFIG_FILE) + if key in file_config: + sources_info.append(("Config File", file_config[key], "green")) + + # Default value + default_val = find_config_default(key) + if default_val: + sources_info.append(("Default", default_val, "gray")) + + # Final computed value + final_value = merged_config.get(key, FLAT_CONFIG.get(key, CONFIGS.get(key, None))) + if not key_is_safe(key): + final_value = "********" + + # Build sources display + sources_html = "
".join([f'{source}: {value}' for source, value, color in sources_info]) + + # aliases = USER_CONFIG.get(key, {}).get("aliases", []) + aliases = [] + + if key in CONSTANTS_CONFIG: + section_header = mark_safe( + f'[CONSTANTS]   {key}   (read-only, hardcoded by ArchiveBox)', + ) + elif key in FLAT_CONFIG: + section_header = mark_safe( + f'data / ArchiveBox.conf   [{find_config_section(key)}]   {key}', + ) + else: + section_header = mark_safe( + f'[DYNAMIC CONFIG]   {key}   (read-only, calculated at runtime)', + ) + + definition_url, definition_label = get_config_definition_link(key) + + section_data = cast( + SectionData, + { + "name": section_header, + "description": None, + "fields": { + "Key": key, + "Type": find_config_type(key), + "Value": final_value, + "Currently read from": find_config_source(key, merged_config), + }, + "help_texts": { + "Key": mark_safe(f""" + Documentation   + + Aliases: {", ".join(aliases)} + + """), + "Type": mark_safe(f''' + + See full definition in {definition_label}... + + '''), + "Value": mark_safe(f''' + { + 'Value is redacted for your security. (Passwords, secrets, API tokens, etc. cannot be viewed in the Web UI)

' + if not key_is_safe(key) + else "" + } +


+ Configuration Sources (highest priority first):

+ {sources_html} +

+

+ To change this value, edit data/ArchiveBox.conf or run: +

+ archivebox config --set {key}="{ + val.strip("'") + if (val := find_config_default(key)) + else (str(FLAT_CONFIG[key] if key_is_safe(key) else "********")).strip("'") + }" +

+ '''), + "Currently read from": mark_safe(f""" + The value shown in the "Value" field comes from the {find_config_source(key, merged_config)} source. +

+ Priority order (highest to lowest): +
    +
  1. Environment - Environment variables
  2. +
  3. Machine - Machine-specific overrides + {f'
    → Edit {key} in Machine.config for this server' if machine_admin_url else ""} +
  4. +
  5. Config File - data/ArchiveBox.conf
  6. +
  7. Default - Default value from code
  8. +
+ {f'
Tip: To override {key} on this machine, edit the Machine.config field and add:
{{"\\"{key}\\": "your_value_here"}}' if machine_admin_url and key not in CONSTANTS_CONFIG else ""} + """), + }, + }, + ) + + return ItemContext( + slug=key, + title=key, + data=[section_data], + ) diff --git a/archivebox/core/welcome_message.py b/archivebox/core/welcome_message.py deleted file mode 100644 index ed5d2d7719..0000000000 --- a/archivebox/core/welcome_message.py +++ /dev/null @@ -1,5 +0,0 @@ -from archivebox.logging_util import log_shell_welcome_msg - - -if __name__ == '__main__': - log_shell_welcome_msg() diff --git a/archivebox/core/widgets.py b/archivebox/core/widgets.py new file mode 100644 index 0000000000..e7d43e0f8e --- /dev/null +++ b/archivebox/core/widgets.py @@ -0,0 +1,671 @@ +__package__ = "archivebox.core" + +import json +import re +import hashlib +from django import forms +from django.utils.html import escape +from django.utils.safestring import mark_safe + + +class TagEditorWidget(forms.Widget): + """ + A widget that renders tags as clickable pills with inline editing. 
+ - Displays existing tags alphabetically as styled pills with X remove button + - Text input with HTML5 datalist for autocomplete suggestions + - Press Enter or Space to create new tags (auto-creates if doesn't exist) + - Uses AJAX for autocomplete and tag creation + """ + + template_name = "" # We render manually + + class Media: + css = {"all": []} + js = [] + + def __init__(self, attrs=None, snapshot_id=None): + self.snapshot_id = snapshot_id + super().__init__(attrs) + + def _escape(self, value): + """Escape HTML entities in value.""" + return escape(str(value)) if value else "" + + def _normalize_id(self, value): + """Normalize IDs for HTML + JS usage (letters, digits, underscore; JS-safe start).""" + normalized = re.sub(r"[^A-Za-z0-9_]", "_", str(value)) + if not normalized or not re.match(r"[A-Za-z_]", normalized): + normalized = f"t_{normalized}" + return normalized + + def _tag_style(self, value): + """Compute a stable pastel color style for a tag value.""" + tag = (value or "").strip().lower() + digest = hashlib.md5(tag.encode("utf-8")).hexdigest() + hue = int(digest[:4], 16) % 360 + bg = f"hsl({hue}, 70%, 92%)" + border = f"hsl({hue}, 60%, 82%)" + fg = f"hsl({hue}, 35%, 28%)" + return f"--tag-bg: {bg}; --tag-border: {border}; --tag-fg: {fg};" + + def render(self, name, value, attrs=None, renderer=None): + """ + Render the tag editor widget. 
+ + Args: + name: Field name + value: Can be: + - QuerySet of Tag objects (from M2M field) + - List of tag names + - Comma-separated string of tag names + - None + attrs: HTML attributes + renderer: Not used + """ + # Parse value to get list of tag names + tags = [] + if value: + if hasattr(value, "all"): # QuerySet + tags = sorted([tag.name for tag in value.all()]) + elif isinstance(value, (list, tuple)): + if value and hasattr(value[0], "name"): # List of Tag objects + tags = sorted([tag.name for tag in value]) + else: # List of strings or IDs + # Could be tag IDs from form submission + from archivebox.core.models import Tag + + tag_names = [] + for v in value: + if isinstance(v, str) and not v.isdigit(): + tag_names.append(v) + else: + try: + tag = Tag.objects.get(pk=v) + tag_names.append(tag.name) + except (Tag.DoesNotExist, ValueError): + if isinstance(v, str): + tag_names.append(v) + tags = sorted(tag_names) + elif isinstance(value, str): + tags = sorted([t.strip() for t in value.split(",") if t.strip()]) + + widget_id_raw = attrs.get("id", name) if attrs else name + widget_id = self._normalize_id(widget_id_raw) + + # Build pills HTML + pills_html = "" + for tag in tags: + pills_html += f''' + + {self._escape(tag)} + + + ''' + + # Build the widget HTML + html = f''' +
+
+ {pills_html} +
+ + + +
+ + + ''' + + return mark_safe(html) + + +class URLFiltersWidget(forms.Widget): + """Render URL allowlist / denylist controls with same-domain autofill.""" + + template_name = "" + + def __init__(self, attrs=None, *, source_selector='textarea[name="url"]'): + self.source_selector = source_selector + super().__init__(attrs) + + def render(self, name, value, attrs=None, renderer=None): + value = value if isinstance(value, dict) else {} + widget_id_raw = attrs.get("id", name) if attrs else name + widget_id = re.sub(r"[^A-Za-z0-9_]", "_", str(widget_id_raw)) or name + allowlist = escape(value.get("allowlist", "") or "") + denylist = escape(value.get("denylist", "") or "") + + return mark_safe(f''' +
+ +
+
+
+ + Regex patterns or domains to exclude, one pattern per line. +
+ +
+
+
+ + Regex patterns or domains to exclude, one pattern per line. +
+ +
+
+ +
These values can be one regex pattern or domain per line. URL_DENYLIST takes precedence over URL_ALLOWLIST.
+ +
+ ''') + + def value_from_datadict(self, data, files, name): + return { + "allowlist": data.get(f"{name}_allowlist", ""), + "denylist": data.get(f"{name}_denylist", ""), + "same_domain_only": data.get(f"{name}_same_domain_only") in ("1", "on", "true"), + } + + +class InlineTagEditorWidget(TagEditorWidget): + """ + Inline version of TagEditorWidget for use in list views. + Includes AJAX save functionality for immediate persistence. + """ + + def __init__(self, attrs=None, snapshot_id=None, editable=True): + super().__init__(attrs, snapshot_id) + self.snapshot_id = snapshot_id + self.editable = editable + + def render(self, name, value, attrs=None, renderer=None, snapshot_id=None): + """Render inline tag editor with AJAX save.""" + # Use snapshot_id from __init__ or from render call + snapshot_id = snapshot_id or self.snapshot_id + + # Parse value to get list of tag dicts with id and name + tag_data = [] + if value: + if hasattr(value, "all"): # QuerySet + for tag in value.all(): + tag_data.append({"id": tag.pk, "name": tag.name}) + tag_data.sort(key=lambda x: x["name"].lower()) + elif isinstance(value, (list, tuple)): + if value and hasattr(value[0], "name"): + for tag in value: + tag_data.append({"id": tag.pk, "name": tag.name}) + tag_data.sort(key=lambda x: x["name"].lower()) + + widget_id_raw = f"inline_tags_{snapshot_id}" if snapshot_id else (attrs.get("id", name) if attrs else name) + widget_id = self._normalize_id(widget_id_raw) + + # Build pills HTML with filter links + pills_html = "" + for td in tag_data: + remove_button = "" + if self.editable: + remove_button = ( + f'' + ) + pills_html += f''' + + {self._escape(td["name"])} + {remove_button} + + ''' + + tags_json = escape(json.dumps(tag_data)) + input_html = "" + readonly_class = " readonly" if not self.editable else "" + if self.editable: + input_html = f''' + + + ''' + + html = f''' + + + {pills_html} + + {input_html} + + ''' + + return mark_safe(html) diff --git a/archivebox/core/wsgi.py 
b/archivebox/core/wsgi.py index 94993b92fe..1b667177a3 100644 --- a/archivebox/core/wsgi.py +++ b/archivebox/core/wsgi.py @@ -7,10 +7,10 @@ https://docs.djangoproject.com/en/2.1/howto/deployment/wsgi/ """ +import archivebox # noqa +from archivebox.config.django import setup_django +from django.core.wsgi import get_wsgi_application -from archivebox.config import setup_django setup_django(in_memory_db=False, check_db=True) -from django.core.wsgi import get_wsgi_application - application = get_wsgi_application() diff --git a/archivebox/crawls/__init__.py b/archivebox/crawls/__init__.py new file mode 100644 index 0000000000..b47f54cadd --- /dev/null +++ b/archivebox/crawls/__init__.py @@ -0,0 +1,8 @@ +__package__ = "archivebox.crawls" +__order__ = 100 + + +def register_admin(admin_site): + from .admin import register_admin as register_crawls_admin + + register_crawls_admin(admin_site) diff --git a/archivebox/crawls/admin.py b/archivebox/crawls/admin.py new file mode 100644 index 0000000000..31c535fd48 --- /dev/null +++ b/archivebox/crawls/admin.py @@ -0,0 +1,832 @@ +__package__ = "archivebox.crawls" + +from django import forms +from django.http import JsonResponse, HttpRequest, HttpResponseNotAllowed +from django.shortcuts import get_object_or_404, redirect +from django.urls import path, reverse +from django.utils.html import escape, format_html, format_html_join +from django.utils import timezone +from django.utils.safestring import mark_safe +from django.contrib import admin, messages +from django.db.models import Count, Q + + +from django_object_actions import action + +from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin + +from archivebox.core.models import Snapshot +from archivebox.core.widgets import TagEditorWidget +from archivebox.crawls.models import Crawl, CrawlSchedule + + +def render_snapshots_list(snapshots_qs, limit=20, crawl=None): + """Render a nice inline list view of snapshots with status, title, URL, and progress.""" + + 
snapshots = snapshots_qs.order_by("-created_at")[:limit].annotate( + total_results=Count("archiveresult"), + succeeded_results=Count("archiveresult", filter=Q(archiveresult__status="succeeded")), + failed_results=Count("archiveresult", filter=Q(archiveresult__status="failed")), + started_results=Count("archiveresult", filter=Q(archiveresult__status="started")), + skipped_results=Count("archiveresult", filter=Q(archiveresult__status="skipped")), + ) + + if not snapshots: + return mark_safe('
No Snapshots yet...
') + + # Status colors matching Django admin and progress monitor + status_colors = { + "queued": ("#6c757d", "#f8f9fa"), # gray + "started": ("#856404", "#fff3cd"), # amber + "sealed": ("#155724", "#d4edda"), # green + "failed": ("#721c24", "#f8d7da"), # red + } + + rows = [] + for snapshot in snapshots: + status = snapshot.status or "queued" + color, bg = status_colors.get(status, ("#6c757d", "#f8f9fa")) + + # Calculate progress + total = snapshot.total_results + succeeded = snapshot.succeeded_results + failed = snapshot.failed_results + running = snapshot.started_results + skipped = snapshot.skipped_results + done = succeeded + failed + skipped + pending = max(total - done - running, 0) + progress_pct = int((done / total) * 100) if total > 0 else 0 + progress_text = f"{done}/{total}" if total > 0 else "-" + progress_title = f"{succeeded} succeeded, {failed} failed, {running} running, {pending} pending, {skipped} skipped" + progress_color = "#28a745" + if failed: + progress_color = "#dc3545" + elif running: + progress_color = "#17a2b8" + elif pending: + progress_color = "#ffc107" + + # Truncate title and URL + snapshot_title = snapshot.title or "Untitled" + title = snapshot_title[:60] + if len(snapshot_title) > 60: + title += "..." + url_display = snapshot.url[:50] + if len(snapshot.url) > 50: + url_display += "..." + delete_button = "" + exclude_button = "" + if crawl is not None: + delete_url = reverse("admin:crawls_crawl_snapshot_delete", args=[crawl.pk, snapshot.pk]) + exclude_url = reverse("admin:crawls_crawl_snapshot_exclude_domain", args=[crawl.pk, snapshot.pk]) + delete_button = f''' + + ''' + exclude_button = f''' + + ''' + + # Format date + date_str = snapshot.created_at.strftime("%Y-%m-%d %H:%M") if snapshot.created_at else "-" + + rows.append(f''' + + + {status} + + + + + + + + {escape(title)} + + + {escape(url_display)} + + +
+
+
+
+ {progress_text} +
+ + + {date_str} + + {f'
{exclude_button}{delete_button}
' if crawl is not None else ""} + + ''') + + total_count = snapshots_qs.count() + footer = "" + if total_count > limit: + footer = f""" + + + Showing {limit} of {total_count} snapshots + + + """ + + return mark_safe(f""" +
+ + + + + + + + + + { + '' if crawl is not None else "" + } + + + + {"".join(rows)} + {footer} + +
StatusTitleURLProgressCreatedActions
+
+ { + ''' + + ''' + if crawl is not None + else "" + } + """) + + +class URLFiltersWidget(forms.Widget): + def render(self, name, value, attrs=None, renderer=None): + value = value if isinstance(value, dict) else {} + widget_id = (attrs or {}).get("id", name) + allowlist = escape(value.get("allowlist", "") or "") + denylist = escape(value.get("denylist", "") or "") + + return mark_safe(f''' +
+ +
+
+ + +
+
+ + +
+
+ +

+ Enter domains, wildcards, or regex patterns. Denylist takes precedence over allowlist. +

+ +
+ ''') + + def value_from_datadict(self, data, files, name): + return { + "allowlist": data.get(f"{name}_allowlist", ""), + "denylist": data.get(f"{name}_denylist", ""), + "same_domain_only": data.get(f"{name}_same_domain_only") in ("1", "on", "true"), + } + + +class URLFiltersField(forms.Field): + widget = URLFiltersWidget + + def to_python(self, value): + if isinstance(value, dict): + return value + return {"allowlist": "", "denylist": "", "same_domain_only": False} + + +class CrawlAdminForm(forms.ModelForm): + """Custom form for Crawl admin to render urls field as textarea.""" + + tags_editor = forms.CharField( + label="Tags", + required=False, + widget=TagEditorWidget(), + help_text="Type tag names and press Enter or Space to add. Click × to remove.", + ) + url_filters = URLFiltersField( + label="URL Filters", + required=False, + help_text="Set URL_ALLOWLIST / URL_DENYLIST for this crawl.", + ) + + class Meta: + model = Crawl + fields = "__all__" + widgets = { + "urls": forms.Textarea( + attrs={ + "rows": 8, + "style": "width: 100%; font-family: monospace; font-size: 13px;", + "placeholder": "https://example.com\nhttps://example2.com\n# Comments start with #", + }, + ), + "notes": forms.Textarea( + attrs={ + "rows": 1, + "style": "width: 100%; min-height: 0; resize: vertical;", + }, + ), + } + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + config = dict(self.instance.config or {}) if self.instance and self.instance.pk else {} + if self.instance and self.instance.pk: + self.initial["tags_editor"] = self.instance.tags_str + self.initial["url_filters"] = { + "allowlist": config.get("URL_ALLOWLIST", ""), + "denylist": config.get("URL_DENYLIST", ""), + "same_domain_only": False, + } + + def clean_tags_editor(self): + tags_str = self.cleaned_data.get("tags_editor", "") + tag_names = [] + seen = set() + for raw_name in tags_str.split(","): + name = raw_name.strip() + if not name: + continue + lowered = name.lower() + if lowered in seen: + 
continue + seen.add(lowered) + tag_names.append(name) + return ",".join(tag_names) + + def clean_url_filters(self): + value = self.cleaned_data.get("url_filters") or {} + return { + "allowlist": "\n".join(Crawl.split_filter_patterns(value.get("allowlist", ""))), + "denylist": "\n".join(Crawl.split_filter_patterns(value.get("denylist", ""))), + "same_domain_only": bool(value.get("same_domain_only")), + } + + def save(self, commit=True): + instance = super().save(commit=False) + instance.tags_str = self.cleaned_data.get("tags_editor", "") + url_filters = self.cleaned_data.get("url_filters") or {} + instance.set_url_filters( + url_filters.get("allowlist", ""), + url_filters.get("denylist", ""), + ) + if commit: + instance.save() + instance.apply_crawl_config_filters() + save_m2m = getattr(self, "_save_m2m", None) + if callable(save_m2m): + save_m2m() + return instance + + +class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin): + form = CrawlAdminForm + list_display = ( + "id", + "created_at", + "created_by", + "max_depth", + "max_urls", + "max_size", + "label", + "notes", + "urls_preview", + "schedule_str", + "status", + "retry_at", + "health_display", + "num_snapshots", + ) + sort_fields = ( + "id", + "created_at", + "created_by", + "max_depth", + "max_urls", + "max_size", + "label", + "notes", + "schedule_str", + "status", + "retry_at", + ) + search_fields = ("id", "created_by__username", "max_depth", "max_urls", "max_size", "label", "notes", "schedule_id", "status", "urls") + + readonly_fields = ("created_at", "modified_at", "snapshots") + + fieldsets = ( + ( + "URLs", + { + "fields": ("urls",), + "classes": ("card", "wide"), + }, + ), + ( + "Info", + { + "fields": ("label", "notes", "tags_editor"), + "classes": ("card",), + }, + ), + ( + "Settings", + { + "fields": (("max_depth", "max_urls", "max_size"), "url_filters", "config"), + "classes": ("card",), + }, + ), + ( + "Status", + { + "fields": ("status", "retry_at"), + "classes": ("card",), + }, + ), + ( + 
"Relations", + { + "fields": ("schedule", "created_by"), + "classes": ("card",), + }, + ), + ( + "Timestamps", + { + "fields": ("created_at", "modified_at"), + "classes": ("card",), + }, + ), + ( + "Snapshots", + { + "fields": ("snapshots",), + "classes": ("card", "wide"), + }, + ), + ) + add_fieldsets = ( + ( + "URLs", + { + "fields": ("urls",), + "classes": ("card", "wide"), + }, + ), + ( + "Info", + { + "fields": ("label", "notes", "tags_editor"), + "classes": ("card",), + }, + ), + ( + "Settings", + { + "fields": (("max_depth", "max_urls", "max_size"), "url_filters", "config"), + "classes": ("card",), + }, + ), + ( + "Status", + { + "fields": ("status", "retry_at"), + "classes": ("card",), + }, + ), + ( + "Relations", + { + "fields": ("schedule", "created_by"), + "classes": ("card",), + }, + ), + ) + + list_filter = ("max_depth", "max_urls", "schedule", "created_by", "status", "retry_at") + ordering = ["-created_at", "-retry_at"] + list_per_page = 100 + actions = ["delete_selected_batched"] + change_actions = ["recrawl"] + + def get_queryset(self, request): + """Optimize queries with select_related and annotations.""" + qs = super().get_queryset(request) + return qs.select_related("schedule", "created_by").annotate(num_snapshots_cached=Count("snapshot_set")) + + def get_fieldsets(self, request, obj=None): + return self.fieldsets if obj else self.add_fieldsets + + def get_urls(self): + urls = super().get_urls() + custom_urls = [ + path( + "/snapshot//delete/", + self.admin_site.admin_view(self.delete_snapshot_view), + name="crawls_crawl_snapshot_delete", + ), + path( + "/snapshot//exclude-domain/", + self.admin_site.admin_view(self.exclude_domain_view), + name="crawls_crawl_snapshot_exclude_domain", + ), + ] + return custom_urls + urls + + @admin.action(description="Delete selected crawls") + def delete_selected_batched(self, request, queryset): + """Delete crawls in a single transaction to avoid SQLite concurrency issues.""" + from django.db import transaction 
+ + total = queryset.count() + + # Get list of IDs to delete first (outside transaction) + ids_to_delete = list(queryset.values_list("pk", flat=True)) + + # Delete everything in a single atomic transaction + with transaction.atomic(): + deleted_count, _ = Crawl.objects.filter(pk__in=ids_to_delete).delete() + + messages.success(request, f"Successfully deleted {total} crawls ({deleted_count} total objects including related records).") + + @action(label="Recrawl", description="Create a new crawl with the same settings") + def recrawl(self, request, obj): + """Duplicate this crawl as a new crawl with the same URLs and settings.""" + + # Validate URLs (required for crawl to start) + if not obj.urls: + messages.error(request, "Cannot recrawl: original crawl has no URLs.") + return redirect("admin:crawls_crawl_change", obj.id) + + new_crawl = Crawl.objects.create( + urls=obj.urls, + max_depth=obj.max_depth, + max_urls=obj.max_urls, + max_size=obj.max_size, + tags_str=obj.tags_str, + config=obj.config, + schedule=obj.schedule, + label=f"{obj.label} (recrawl)" if obj.label else "", + notes=obj.notes, + created_by=request.user, + status=Crawl.StatusChoices.QUEUED, + retry_at=timezone.now(), + ) + + messages.success(request, f"Created new crawl {new_crawl.id} with the same settings. 
It will start processing shortly.") + + return redirect("admin:crawls_crawl_change", new_crawl.id) + + def num_snapshots(self, obj): + # Use cached annotation from get_queryset to avoid N+1 + return getattr(obj, "num_snapshots_cached", obj.snapshot_set.count()) + + def snapshots(self, obj): + return render_snapshots_list(obj.snapshot_set.all(), crawl=obj) + + def delete_snapshot_view(self, request: HttpRequest, object_id: str, snapshot_id: str): + if request.method != "POST": + return HttpResponseNotAllowed(["POST"]) + + crawl = get_object_or_404(Crawl, pk=object_id) + snapshot = get_object_or_404(Snapshot, pk=snapshot_id, crawl=crawl) + + if snapshot.status == Snapshot.StatusChoices.STARTED: + snapshot.cancel_running_hooks() + + removed_urls = crawl.prune_url(snapshot.url) + snapshot.delete() + return JsonResponse( + { + "ok": True, + "snapshot_id": str(snapshot.id), + "removed_urls": removed_urls, + }, + ) + + def exclude_domain_view(self, request: HttpRequest, object_id: str, snapshot_id: str): + if request.method != "POST": + return HttpResponseNotAllowed(["POST"]) + + crawl = get_object_or_404(Crawl, pk=object_id) + snapshot = get_object_or_404(Snapshot, pk=snapshot_id, crawl=crawl) + result = crawl.exclude_domain(snapshot.url) + return JsonResponse( + { + "ok": True, + **result, + }, + ) + + @admin.display(description="Schedule", ordering="schedule") + def schedule_str(self, obj): + if not obj.schedule: + return mark_safe("None") + return format_html('{}', obj.schedule.admin_change_url, obj.schedule) + + @admin.display(description="URLs", ordering="urls") + def urls_preview(self, obj): + first_url = obj.get_urls_list()[0] if obj.get_urls_list() else "" + return first_url[:80] + "..." 
if len(first_url) > 80 else first_url + + @admin.display(description="Health", ordering="health") + def health_display(self, obj): + h = obj.health + color = "green" if h >= 80 else "orange" if h >= 50 else "red" + return format_html('{}', color, h) + + @admin.display(description="URLs") + def urls_editor(self, obj): + """Editor for crawl URLs.""" + widget_id = f"crawl_urls_{obj.pk}" + + # Escape for safe HTML embedding + escaped_urls = (obj.urls or "").replace("&", "&").replace("<", "<").replace(">", ">").replace('"', """) + + # Count lines for auto-expand logic + line_count = len((obj.urls or "").split("\n")) + uri_rows = min(max(3, line_count), 10) + + html = f''' +
+ +
+ + +

+ {line_count} URL{"s" if line_count != 1 else ""} ¡ Note: URLs displayed here for reference only +

+
+
+ ''' + return mark_safe(html) + + +class CrawlScheduleAdmin(BaseModelAdmin): + list_display = ("id", "created_at", "created_by", "label", "notes", "template_str", "crawls", "num_crawls", "num_snapshots") + sort_fields = ("id", "created_at", "created_by", "label", "notes", "template_str") + search_fields = ("id", "created_by__username", "label", "notes", "schedule_id", "template_id", "template__urls") + + readonly_fields = ("created_at", "modified_at", "crawls", "snapshots") + + fieldsets = ( + ( + "Schedule Info", + { + "fields": ("label", "notes"), + "classes": ("card",), + }, + ), + ( + "Configuration", + { + "fields": ("schedule", "template"), + "classes": ("card",), + }, + ), + ( + "Metadata", + { + "fields": ("created_by", "created_at", "modified_at"), + "classes": ("card",), + }, + ), + ( + "Crawls", + { + "fields": ("crawls",), + "classes": ("card", "wide"), + }, + ), + ( + "Snapshots", + { + "fields": ("snapshots",), + "classes": ("card", "wide"), + }, + ), + ) + + list_filter = ("created_by",) + ordering = ["-created_at"] + list_per_page = 100 + actions = ["delete_selected"] + + def get_queryset(self, request): + return ( + super() + .get_queryset(request) + .select_related("created_by", "template") + .annotate( + crawl_count=Count("crawl", distinct=True), + snapshot_count=Count("crawl__snapshot_set", distinct=True), + ) + ) + + def get_fieldsets(self, request, obj=None): + if obj is None: + return tuple(fieldset for fieldset in self.fieldsets if fieldset[0] not in {"Crawls", "Snapshots"}) + return self.fieldsets + + def save_model(self, request, obj, form, change): + if not obj.created_by_id and getattr(request, "user", None) and request.user.is_authenticated: + obj.created_by = request.user + super().save_model(request, obj, form, change) + + @admin.display(description="Template", ordering="template") + def template_str(self, obj): + return format_html('{}', obj.template.admin_change_url, obj.template) + + @admin.display(description="# Crawls", 
ordering="crawl_count") + def num_crawls(self, obj): + return getattr(obj, "crawl_count", obj.crawl_set.count()) + + @admin.display(description="# Snapshots", ordering="snapshot_count") + def num_snapshots(self, obj): + return getattr(obj, "snapshot_count", Snapshot.objects.filter(crawl__schedule=obj).count()) + + def crawls(self, obj): + return format_html_join( + "
", + ' - {}', + ((crawl.admin_change_url, crawl) for crawl in obj.crawl_set.all().order_by("-created_at")[:20]), + ) or mark_safe("No Crawls yet...") + + def snapshots(self, obj): + crawl_ids = obj.crawl_set.values_list("pk", flat=True) + return render_snapshots_list(Snapshot.objects.filter(crawl_id__in=crawl_ids)) + + +def register_admin(admin_site): + admin_site.register(Crawl, CrawlAdmin) + admin_site.register(CrawlSchedule, CrawlScheduleAdmin) diff --git a/archivebox/crawls/apps.py b/archivebox/crawls/apps.py new file mode 100644 index 0000000000..b9e5ed660f --- /dev/null +++ b/archivebox/crawls/apps.py @@ -0,0 +1,15 @@ +from django.apps import AppConfig + + +class CrawlsConfig(AppConfig): + default_auto_field = "django.db.models.BigAutoField" + name = "archivebox.crawls" + label = "crawls" + + def ready(self): + """Import models to register state machines with the registry""" + import sys + + # Skip during makemigrations to avoid premature state machine access + if "makemigrations" not in sys.argv: + from archivebox.crawls.models import CrawlMachine # noqa: F401 diff --git a/archivebox/crawls/migrations/0001_initial.py b/archivebox/crawls/migrations/0001_initial.py new file mode 100644 index 0000000000..c90b52ad85 --- /dev/null +++ b/archivebox/crawls/migrations/0001_initial.py @@ -0,0 +1,177 @@ +# Generated by hand on 2025-12-29 +# Creates Crawl and CrawlSchedule tables using raw SQL + +from django.db import migrations, models +import django.db.models.deletion +import django.utils.timezone +import django.core.validators +from django.conf import settings +from archivebox.uuid_compat import uuid7 +from archivebox.base_models.models import get_or_create_system_user_pk + + +class Migration(migrations.Migration): + initial = True + + dependencies = [ + ("auth", "0012_alter_user_first_name_max_length"), + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ] + + operations = [ + migrations.SeparateDatabaseAndState( + database_operations=[ + 
migrations.RunSQL( + sql=""" + -- Create crawls_crawlschedule table first (circular FK will be added later) + CREATE TABLE IF NOT EXISTS crawls_crawlschedule ( + id TEXT PRIMARY KEY NOT NULL, + created_at DATETIME NOT NULL, + modified_at DATETIME NOT NULL, + num_uses_succeeded INTEGER NOT NULL DEFAULT 0, + num_uses_failed INTEGER NOT NULL DEFAULT 0, + + schedule VARCHAR(64) NOT NULL, + is_enabled BOOLEAN NOT NULL DEFAULT 1, + label VARCHAR(64) NOT NULL DEFAULT '', + notes TEXT NOT NULL DEFAULT '', + + template_id TEXT NOT NULL, + created_by_id INTEGER NOT NULL, + + FOREIGN KEY (created_by_id) REFERENCES auth_user(id) ON DELETE CASCADE + ); + CREATE INDEX IF NOT EXISTS crawls_crawlschedule_created_at_idx ON crawls_crawlschedule(created_at); + CREATE INDEX IF NOT EXISTS crawls_crawlschedule_created_by_id_idx ON crawls_crawlschedule(created_by_id); + CREATE INDEX IF NOT EXISTS crawls_crawlschedule_template_id_idx ON crawls_crawlschedule(template_id); + + -- Create crawls_crawl table + CREATE TABLE IF NOT EXISTS crawls_crawl ( + id TEXT PRIMARY KEY NOT NULL, + created_at DATETIME NOT NULL, + modified_at DATETIME NOT NULL, + num_uses_succeeded INTEGER NOT NULL DEFAULT 0, + num_uses_failed INTEGER NOT NULL DEFAULT 0, + + urls TEXT NOT NULL, + config TEXT, + max_depth INTEGER NOT NULL DEFAULT 0, + tags_str VARCHAR(1024) NOT NULL DEFAULT '', + persona_id TEXT, + label VARCHAR(64) NOT NULL DEFAULT '', + notes TEXT NOT NULL DEFAULT '', + output_dir VARCHAR(512) NOT NULL DEFAULT '', + + status VARCHAR(15) NOT NULL DEFAULT 'queued', + retry_at DATETIME, + created_by_id INTEGER NOT NULL, + schedule_id TEXT, + + FOREIGN KEY (created_by_id) REFERENCES auth_user(id) ON DELETE CASCADE, + FOREIGN KEY (schedule_id) REFERENCES crawls_crawlschedule(id) ON DELETE SET NULL + ); + CREATE INDEX IF NOT EXISTS crawls_crawl_status_idx ON crawls_crawl(status); + CREATE INDEX IF NOT EXISTS crawls_crawl_retry_at_idx ON crawls_crawl(retry_at); + CREATE INDEX IF NOT EXISTS 
crawls_crawl_created_at_idx ON crawls_crawl(created_at); + CREATE INDEX IF NOT EXISTS crawls_crawl_created_by_id_idx ON crawls_crawl(created_by_id); + CREATE INDEX IF NOT EXISTS crawls_crawl_schedule_id_idx ON crawls_crawl(schedule_id); + """, + reverse_sql=""" + DROP TABLE IF EXISTS crawls_crawl; + DROP TABLE IF EXISTS crawls_crawlschedule; + """, + ), + ], + state_operations=[ + migrations.CreateModel( + name="CrawlSchedule", + fields=[ + ("id", models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)), + ("created_at", models.DateTimeField(db_index=True, default=django.utils.timezone.now)), + ("modified_at", models.DateTimeField(auto_now=True)), + ("num_uses_succeeded", models.PositiveIntegerField(default=0)), + ("num_uses_failed", models.PositiveIntegerField(default=0)), + ("schedule", models.CharField(max_length=64)), + ("is_enabled", models.BooleanField(default=True)), + ("label", models.CharField(blank=True, default="", max_length=64)), + ("notes", models.TextField(blank=True, default="")), + ( + "created_by", + models.ForeignKey( + default=get_or_create_system_user_pk, + on_delete=django.db.models.deletion.CASCADE, + to=settings.AUTH_USER_MODEL, + ), + ), + ], + options={ + "verbose_name": "Scheduled Crawl", + "verbose_name_plural": "Scheduled Crawls", + "app_label": "crawls", + }, + ), + migrations.CreateModel( + name="Crawl", + fields=[ + ("id", models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)), + ("created_at", models.DateTimeField(db_index=True, default=django.utils.timezone.now)), + ("modified_at", models.DateTimeField(auto_now=True)), + ("num_uses_succeeded", models.PositiveIntegerField(default=0)), + ("num_uses_failed", models.PositiveIntegerField(default=0)), + ("urls", models.TextField(help_text="Newline-separated list of URLs to crawl")), + ("config", models.JSONField(blank=True, default=dict, null=True)), + ( + "max_depth", + models.PositiveSmallIntegerField( + 
def upgrade_crawl_table_from_v086(apps, schema_editor):
    """
    Upgrade crawls_crawl table from v0.8.6rc0 schema to v0.9.0 schema.

    The upgrade only runs when the existing table has a ``seed_id`` column and
    no ``urls`` column (the v0.8.6rc0 layout). In that case a new table with
    the v0.9.0 columns is created, existing rows are copied across, the old
    table is dropped and the new one renamed into place, and the indexes are
    recreated. In every other case the function returns without changes.

    Args:
        apps: historical app registry supplied by RunPython (unused).
        schema_editor: schema editor supplied by RunPython; its ``connection``
            is the database the migration is actually being applied to.
    """
    # Use the connection this migration is running against rather than the
    # global default connection, and make sure the cursor is always closed.
    with schema_editor.connection.cursor() as cursor:
        # Check if crawls_crawl table exists
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='crawls_crawl'")
        if not cursor.fetchone():
            return

        # Detect schema version
        cursor.execute("PRAGMA table_info(crawls_crawl)")
        crawl_cols = {row[1] for row in cursor.fetchall()}
        has_seed_id = "seed_id" in crawl_cols
        has_urls = "urls" in crawl_cols

        # Only upgrade if we have v0.8.6rc0 schema
        if not (has_seed_id and not has_urls):
            return

        # Check if table has any rows
        cursor.execute("SELECT COUNT(*) FROM crawls_crawl")
        has_data = cursor.fetchone()[0] > 0

        # v0.8.6rc0 schema - upgrade to v0.9.0
        if has_data:
            print("Upgrading crawls_crawl from v0.8.6rc0 to v0.9.0...")

        cursor.execute("""
            CREATE TABLE IF NOT EXISTS crawls_crawl_new (
                id TEXT PRIMARY KEY NOT NULL,
                created_at DATETIME NOT NULL,
                modified_at DATETIME NOT NULL,
                num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
                num_uses_failed INTEGER NOT NULL DEFAULT 0,

                urls TEXT NOT NULL,
                config TEXT,
                max_depth INTEGER NOT NULL DEFAULT 0,
                tags_str VARCHAR(1024) NOT NULL DEFAULT '',
                persona_id TEXT,
                label VARCHAR(64) NOT NULL DEFAULT '',
                notes TEXT NOT NULL DEFAULT '',
                output_dir VARCHAR(512) NOT NULL DEFAULT '',

                status VARCHAR(15) NOT NULL DEFAULT 'queued',
                retry_at DATETIME,
                created_by_id INTEGER NOT NULL,
                schedule_id TEXT,

                FOREIGN KEY (created_by_id) REFERENCES auth_user(id) ON DELETE CASCADE,
                FOREIGN KEY (schedule_id) REFERENCES crawls_crawlschedule(id) ON DELETE SET NULL
            );
        """)

        if has_data:
            # NOTE(review): `urls` is filled with '' for every migrated row, so
            # whatever the old seed_id pointed at is not carried over here --
            # confirm this is intended.
            cursor.execute("""
                INSERT OR IGNORE INTO crawls_crawl_new (
                    id, created_at, modified_at, num_uses_succeeded, num_uses_failed,
                    urls, config, max_depth, tags_str, persona_id, label, notes, output_dir,
                    status, retry_at, created_by_id, schedule_id
                )
                SELECT
                    id, created_at, modified_at, num_uses_succeeded, num_uses_failed,
                    '', config, max_depth, tags_str, NULL, '', '', '',
                    status, retry_at, created_by_id, schedule_id
                FROM crawls_crawl;
            """)

        # Swap the rebuilt table into place
        cursor.execute("DROP TABLE crawls_crawl;")
        cursor.execute("ALTER TABLE crawls_crawl_new RENAME TO crawls_crawl;")

        # Indexes were dropped together with the old table; recreate them
        cursor.execute("CREATE INDEX IF NOT EXISTS crawls_crawl_status_idx ON crawls_crawl(status);")
        cursor.execute("CREATE INDEX IF NOT EXISTS crawls_crawl_retry_at_idx ON crawls_crawl(retry_at);")
        cursor.execute("CREATE INDEX IF NOT EXISTS crawls_crawl_created_at_idx ON crawls_crawl(created_at);")
        cursor.execute("CREATE INDEX IF NOT EXISTS crawls_crawl_created_by_id_idx ON crawls_crawl(created_by_id);")
        cursor.execute("CREATE INDEX IF NOT EXISTS crawls_crawl_schedule_id_idx ON crawls_crawl(schedule_id);")

        if has_data:
            print("✓ crawls_crawl upgraded to v0.9.0")
+from django.db import migrations + + +class Migration(migrations.Migration): + dependencies = [ + ("crawls", "0003_remove_crawlschedule_num_uses_failed_and_more"), + ] + + operations = [ + migrations.RemoveField( + model_name="crawl", + name="output_dir", + ), + ] diff --git a/archivebox/crawls/migrations/0005_add_crawl_limits.py b/archivebox/crawls/migrations/0005_add_crawl_limits.py new file mode 100644 index 0000000000..c931816227 --- /dev/null +++ b/archivebox/crawls/migrations/0005_add_crawl_limits.py @@ -0,0 +1,31 @@ +# Generated by Django 6.0 on 2026-03-23 00:00 + +import django.core.validators +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("crawls", "0004_remove_crawl_output_dir"), + ] + + operations = [ + migrations.AddField( + model_name="crawl", + name="max_size", + field=models.BigIntegerField( + default=0, + help_text="Maximum total archived output size in bytes for this crawl (0 = unlimited).", + validators=[django.core.validators.MinValueValidator(0)], + ), + ), + migrations.AddField( + model_name="crawl", + name="max_urls", + field=models.IntegerField( + default=0, + help_text="Maximum number of URLs to snapshot for this crawl (0 = unlimited).", + validators=[django.core.validators.MinValueValidator(0)], + ), + ), + ] diff --git a/archivebox/crawls/migrations/__init__.py b/archivebox/crawls/migrations/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/archivebox/crawls/models.py b/archivebox/crawls/models.py new file mode 100755 index 0000000000..4b5e58d5bc --- /dev/null +++ b/archivebox/crawls/models.py @@ -0,0 +1,1102 @@ +__package__ = "archivebox.crawls" + +from typing import TYPE_CHECKING +import uuid +import json +import re +from datetime import timedelta +from archivebox.uuid_compat import uuid7 +from pathlib import Path +from urllib.parse import urlparse + +from django.db import models +from django.core.validators import MaxValueValidator, MinValueValidator 
class CrawlSchedule(ModelWithUUID, ModelWithNotes):
    """
    A recurring schedule that enqueues new Crawls cloned from a template Crawl.

    `schedule` holds the schedule expression (checked by validate_schedule on
    save), `template` is the Crawl whose settings are copied by enqueue(), and
    `crawl_set` is the reverse relation to the Crawls attached to this schedule.
    """

    id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
    created_at = models.DateTimeField(default=timezone.now, db_index=True)
    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False)
    modified_at = models.DateTimeField(auto_now=True)

    template: "Crawl" = models.ForeignKey("Crawl", on_delete=models.CASCADE, null=False, blank=False)  # type: ignore
    schedule = models.CharField(max_length=64, blank=False, null=False)
    is_enabled = models.BooleanField(default=True)
    label = models.CharField(max_length=64, blank=True, null=False, default="")
    notes = models.TextField(blank=True, null=False, default="")

    crawl_set: models.Manager["Crawl"]

    class Meta(ModelWithUUID.Meta, ModelWithNotes.Meta):
        app_label = "crawls"
        verbose_name = "Scheduled Crawl"
        verbose_name_plural = "Scheduled Crawls"

    def __str__(self) -> str:
        """Show the id, the first 64 characters of the template's urls, and the schedule."""
        urls_preview = ""
        if self.template and self.template.urls:
            urls_preview = self.template.urls[:64]
        return f"[{self.id}] {urls_preview} @ {self.schedule}"

    @property
    def api_url(self) -> str:
        """URL of this object in the REST API."""
        endpoint = reverse_lazy("api-1:get_any", args=[self.id])
        return str(endpoint)

    def save(self, *args, **kwargs):
        """Normalize and validate the schedule, default the label, then link the template back to this schedule."""
        cleaned_schedule = (self.schedule or "").strip()
        self.schedule = cleaned_schedule
        validate_schedule(cleaned_schedule)

        # Fall back to the template's label when no label was given
        if not self.label:
            self.label = self.template.label if self.template else ""

        super().save(*args, **kwargs)

        # Keep template.schedule pointing at this schedule
        if self.template:
            self.template.schedule = self
            self.template.save()

    @property
    def last_run_at(self):
        """created_at of the newest attached crawl, else of the template, else of the schedule itself."""
        newest_crawl = self.crawl_set.order_by("-created_at").first()
        reference = newest_crawl or self.template or self
        return reference.created_at

    @property
    def next_run_at(self):
        """Next run time as computed by next_run_for_schedule from last_run_at."""
        return next_run_for_schedule(self.schedule, self.last_run_at)

    def is_due(self, now=None) -> bool:
        """Return whether the schedule is enabled and its next run time is not after `now` (default: current time)."""
        reference_time = now if now else timezone.now()
        return self.is_enabled and self.next_run_at <= reference_time

    def enqueue(self, queued_at=None) -> "Crawl":
        """Create and return a new QUEUED Crawl copied from the template, with retry_at set to `queued_at` (default: now)."""
        source = self.template
        crawl_fields = {
            "urls": source.urls,
            "config": source.config or {},
            "max_depth": source.max_depth,
            "max_urls": source.max_urls,
            "max_size": source.max_size,
            "tags_str": source.tags_str,
            "persona_id": source.persona_id,
            "label": source.label or self.label,
            "notes": source.notes,
            "schedule": self,
            "status": Crawl.StatusChoices.QUEUED,
            "retry_at": queued_at or timezone.now(),
            "created_by": source.created_by,
        }
        return Crawl.objects.create(**crawl_fields)
help_text="Maximum number of URLs to snapshot for this crawl (0 = unlimited).", + ) + max_size = models.BigIntegerField( + default=0, + validators=[MinValueValidator(0)], + help_text="Maximum total archived output size in bytes for this crawl (0 = unlimited).", + ) + tags_str = models.CharField(max_length=1024, blank=True, null=False, default="") + persona_id = models.UUIDField(null=True, blank=True) + label = models.CharField(max_length=64, blank=True, null=False, default="") + notes = models.TextField(blank=True, null=False, default="") + schedule = models.ForeignKey(CrawlSchedule, on_delete=models.SET_NULL, null=True, blank=True, editable=True) + + status = ModelWithStateMachine.StatusField( + choices=ModelWithStateMachine.StatusChoices, + default=ModelWithStateMachine.StatusChoices.QUEUED, + ) + retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now) + + state_machine_name = "archivebox.crawls.models.CrawlMachine" + retry_at_field_name = "retry_at" + state_field_name = "status" + StatusChoices = ModelWithStateMachine.StatusChoices + active_state = StatusChoices.STARTED + + schedule_id: uuid.UUID | None + sm: "CrawlMachine" + + snapshot_set: models.Manager["Snapshot"] + + class Meta( + ModelWithOutputDir.Meta, + ModelWithConfig.Meta, + ModelWithHealthStats.Meta, + ModelWithStateMachine.Meta, + ): + app_label = "crawls" + verbose_name = "Crawl" + verbose_name_plural = "Crawls" + + def __str__(self): + first_url = self.get_urls_list()[0] if self.get_urls_list() else "" + # Show last 8 digits of UUID and more of the URL + short_id = str(self.id)[-8:] + return f"[...{short_id}] {first_url[:120]}" + + def save(self, *args, **kwargs): + config = dict(self.config or {}) + if self.max_urls > 0: + config["MAX_URLS"] = self.max_urls + else: + config.pop("MAX_URLS", None) + + if self.max_size > 0: + config["MAX_SIZE"] = self.max_size + else: + config.pop("MAX_SIZE", None) + + if config != (self.config or {}): + self.config = config + update_fields = 
@staticmethod
def from_json(record: dict, overrides: dict | None = None):
    """
    Create or get a Crawl from a JSON dict.

    If the record carries an 'id' that matches an existing Crawl, that Crawl
    is returned unchanged. Otherwise a new QUEUED Crawl is created from the
    record, with any `overrides` taking precedence over record values.

    Args:
        record: Dict with 'urls' (or 'url'), optional 'id', 'max_depth' (or
            'depth'), 'max_urls', 'max_size', 'tags_str' (or 'tags'), 'label'
        overrides: Dict of field overrides (e.g., created_by_id)

    Returns:
        Crawl instance or None if the record has no URLs
    """
    overrides = overrides or {}

    # Check if crawl already exists by ID
    crawl_id = record.get("id")
    if crawl_id:
        try:
            return Crawl.objects.get(id=crawl_id)
        except Crawl.DoesNotExist:
            pass

    # Get URLs - can be string (newline-separated) or from 'url' field
    urls = record.get("urls") or record.get("url")
    if not urls:
        return None

    # Create new crawl (status stays QUEUED, not started)
    fields = {
        "urls": urls,
        "max_depth": record.get("max_depth", record.get("depth", 0)),
        "max_urls": record.get("max_urls", 0),
        "max_size": record.get("max_size", 0),
        "tags_str": record.get("tags_str", record.get("tags", "")),
        "label": record.get("label", ""),
        "status": Crawl.StatusChoices.QUEUED,
        "retry_at": timezone.now(),
    }
    # Merge instead of passing **overrides next to explicit keywords: an
    # override for a field already listed above (e.g. max_depth or label)
    # previously raised "TypeError: got multiple values for keyword argument".
    fields.update(overrides)
    return Crawl.objects.create(**fields)
@staticmethod
def normalize_domain(value: str) -> str:
    """
    Reduce a domain, host[:port] or URL to a lowercase ``host`` / ``host_port`` key.

    Args:
        value: A bare domain ("example.com", ".example.com"), a host with
            port, a scheme-less URL ("example.com/path"), a protocol-relative
            URL ("//example.com/x") or a full URL.

    Returns:
        The lowercase hostname, suffixed with ``_<port>`` when an explicit
        port is present, or "" when no hostname can be extracted.
    """
    candidate = (value or "").strip().lower()
    if not candidate:
        return ""

    # urlparse only recognises a netloc after "//". Give scheme-less input a
    # scheme so "example.com" and "example.com/path" both resolve to a host.
    # Values that start with "/" (paths, protocol-relative URLs) are left as-is.
    if "://" not in candidate and not candidate.startswith("/"):
        candidate = f"https://{candidate.lstrip('.')}"

    try:
        parsed = urlparse(candidate)
        hostname = parsed.hostname or ""
        port = parsed.port  # raises ValueError for non-numeric / out-of-range ports
    except ValueError:
        # malformed netloc (e.g. unbalanced IPv6 brackets) or invalid port
        return ""

    if not hostname:
        return ""
    if port:
        return f"{hostname}_{port}"
    return hostname
def url_passes_filters(self, url: str, *, snapshot=None, use_effective_config: bool = True) -> bool:
    """
    Decide whether `url` is allowed by this crawl's deny/allow patterns.

    A URL matching any denylist pattern is rejected. Otherwise, when an
    allowlist is present the URL must match at least one of its patterns;
    with no allowlist every non-denied URL passes.
    """
    lookup_kwargs = {"use_effective_config": use_effective_config, "snapshot": snapshot}
    blocked_patterns = self.get_url_denylist(**lookup_kwargs)
    permitted_patterns = self.get_url_allowlist(**lookup_kwargs)

    # The denylist always wins
    if any(self._pattern_matches_url(url, blocked) for blocked in blocked_patterns):
        return False

    # No allowlist configured: everything not denied is accepted
    if not permitted_patterns:
        return True

    for permitted in permitted_patterns:
        if self._pattern_matches_url(url, permitted):
            return True
    return False
def _iter_url_lines(self) -> list[tuple[str, str]]:
    """
    Parse ``self.urls`` into ``(raw_line, url)`` pairs, skipping blank lines.

    Each non-blank line is classified as:
      - a comment (starts with "#"): url is "" so callers keep the line as-is
      - a JSON object: url is its stripped "url" value ("" when missing)
      - anything else, including JSON that is not an object: the stripped
        line itself is used as the url

    Returns:
        List of (line with trailing whitespace removed, extracted url).
    """
    entries: list[tuple[str, str]] = []
    for raw_line in (self.urls or "").splitlines():
        stripped = raw_line.strip()
        if not stripped:
            continue

        line = raw_line.rstrip()
        if stripped.startswith("#"):
            entries.append((line, ""))
            continue

        try:
            entry = json.loads(stripped)
        except json.JSONDecodeError:
            entries.append((line, stripped))
            continue

        if isinstance(entry, dict):
            url = str(entry.get("url", "") or "").strip()
        else:
            # Valid JSON but not an object (e.g. 123, "text", [1, 2], null):
            # calling .get() on it used to raise AttributeError, so treat the
            # line like any other plain-text entry instead.
            url = stripped
        entries.append((line, url))
    return entries
(self.urls or ""): + self.urls = next_urls + self.save(update_fields=["urls", "modified_at"]) + return removed_urls + + def prune_url(self, url: str) -> int: + target = (url or "").strip() + removed = self.prune_urls(lambda candidate: candidate == target) + return len(removed) + + def exclude_domain(self, domain: str) -> dict[str, int | str | bool]: + normalized_domain = self.normalize_domain(domain) + if not normalized_domain: + return { + "domain": "", + "created": False, + "removed_urls": 0, + "deleted_snapshots": 0, + } + + domains = self.get_url_denylist(use_effective_config=False) + created = normalized_domain not in domains + if created: + domains.append(normalized_domain) + self.set_url_filters( + self.get_url_allowlist(use_effective_config=False), + domains, + ) + self.save(update_fields=["config", "modified_at"]) + + filter_result = self.apply_crawl_config_filters() + + return { + "domain": normalized_domain, + "created": created, + "removed_urls": filter_result["removed_urls"], + "deleted_snapshots": filter_result["deleted_snapshots"], + } + + def get_system_task(self) -> str | None: + urls = self.get_urls_list() + if len(urls) != 1: + return None + system_url = urls[0].strip().lower() + if system_url.startswith("archivebox://"): + return system_url + return None + + def resolve_persona(self): + from archivebox.personas.models import Persona + + if self.persona_id: + persona = Persona.objects.filter(id=self.persona_id).first() + if persona is None: + raise Persona.DoesNotExist(f"Crawl {self.id} references missing Persona {self.persona_id}") + return persona + + default_persona_name = str((self.config or {}).get("DEFAULT_PERSONA") or "").strip() + if default_persona_name: + persona, _ = Persona.objects.get_or_create(name=default_persona_name or "Default") + return persona + + return None + + def add_url(self, entry: dict) -> bool: + """ + Add a URL to the crawl queue if not already present. 
def create_snapshots_from_urls(self) -> list["Snapshot"]:
    """
    Create Snapshot objects for each URL in self.urls that doesn't already exist.

    Each non-blank, non-comment line of self.urls is either a JSON object
    (with 'url' and optional 'depth', 'title', 'timestamp', 'tags') or a plain
    URL. Lines whose URL is empty after sanitizing, fails the crawl's URL
    filters, or exceeds max_depth are skipped.

    Returns:
        List of newly created Snapshot objects
    """
    from archivebox.core.models import Snapshot
    from archivebox.misc.util import fix_url_from_markdown, sanitize_extracted_url

    created_snapshots = []

    for line in (self.urls or "").splitlines():
        stripped = line.strip()
        # Skip blanks and "#" comments, matching get_urls_list() and
        # _iter_url_lines(); comments were previously treated as URLs here.
        if not stripped or stripped.startswith("#"):
            continue

        # Parse JSONL or plain URL
        try:
            entry = json.loads(stripped)
        except json.JSONDecodeError:
            entry = None

        if isinstance(entry, dict):
            raw_url = str(entry.get("url", "") or "").strip()
            depth = entry.get("depth") or 0  # explicit null depth counts as 0
            title = entry.get("title")
            timestamp = entry.get("timestamp")
            tags = entry.get("tags", "")
        else:
            # Plain-text line, or JSON that is not an object (e.g. a bare
            # number or string), which used to crash on entry.get()
            raw_url = stripped
            depth = 0
            title = None
            timestamp = None
            tags = self.tags_str

        url = sanitize_extracted_url(fix_url_from_markdown(raw_url))
        if not url:
            continue
        if not self.url_passes_filters(url):
            continue

        # Skip if depth exceeds max_depth
        if depth > self.max_depth:
            continue

        # Create snapshot if doesn't exist
        snapshot, created = Snapshot.objects.get_or_create(
            url=url,
            crawl=self,
            defaults={
                "depth": depth,
                "title": title,
                "timestamp": timestamp or str(timezone.now().timestamp()),
                "status": Snapshot.INITIAL_STATE,
                "retry_at": timezone.now(),
                # Note: created_by removed in 0.9.0 - Snapshot inherits from Crawl
            },
        )

        if created:
            created_snapshots.append(snapshot)
            # Save tags if present; JSONL entries may carry them either as a
            # comma-separated string or as a list
            if tags:
                tag_names = tags.split(",") if isinstance(tags, str) else list(tags)
                snapshot.save_tags(tag_names)

        # Ensure crawl -> snapshot symlink exists for both new and existing snapshots
        try:
            snapshot.ensure_crawl_symlink()
        except Exception:
            # Deliberately best-effort: a failed symlink must not abort
            # snapshot creation for the remaining URLs
            pass

    return created_snapshots
Only that owner executes `.sm.tick()` and performs install side effects + 4. Everyone else waits for the claimed owner to finish instead of launching + a second install against shared state such as the pip or npm trees + + This helper follows that contract by claiming each Binary before ticking + it, and by waiting when another worker already owns the row. That keeps + synchronous crawl execution compatible with the shared background runner and + avoids duplicate installs of the same dependency. + """ + import time + from archivebox.machine.models import Binary, Machine + + if not binary_names: + return + + machine = machine or Machine.current() + lock_seconds = 600 + deadline = time.monotonic() + max(lock_seconds, len(binary_names) * lock_seconds) + + while time.monotonic() < deadline: + unresolved_binaries = list( + Binary.objects.filter( + machine=machine, + name__in=binary_names, + ) + .exclude( + status=Binary.StatusChoices.INSTALLED, + ) + .order_by("name"), + ) + if not unresolved_binaries: + return + + claimed_any = False + waiting_on_existing_owner = False + now = timezone.now() + + for binary in unresolved_binaries: + try: + if binary.tick_claimed(lock_seconds=lock_seconds): + claimed_any = True + continue + except Exception: + claimed_any = True + continue + + binary.refresh_from_db() + if binary.status == Binary.StatusChoices.INSTALLED: + claimed_any = True + continue + if binary.retry_at and binary.retry_at > now: + waiting_on_existing_owner = True + + if claimed_any: + continue + if waiting_on_existing_owner: + time.sleep(0.5) + continue + break + + unresolved_binaries = list( + Binary.objects.filter( + machine=machine, + name__in=binary_names, + ) + .exclude( + status=Binary.StatusChoices.INSTALLED, + ) + .order_by("name"), + ) + if unresolved_binaries: + binary_details = ", ".join( + f"{binary.name} (status={binary.status}, retry_at={binary.retry_at})" for binary in unresolved_binaries + ) + raise RuntimeError( + f"Crawl dependencies failed to 
install before continuing: {binary_details}", + ) + + def run(self) -> "Snapshot | None": + """ + Execute this Crawl: run hooks, process JSONL, create snapshots. + + Called by the state machine when entering the 'started' state. + + Returns: + The root Snapshot for this crawl, or None for system crawls that don't create snapshots + """ + import time + from pathlib import Path + from archivebox.hooks import run_hook, discover_hooks, process_hook_records, is_finite_background_hook + from archivebox.config.configset import get_config + from archivebox.machine.models import Binary, Machine + + # Debug logging to file (since stdout/stderr redirected to /dev/null in progress mode) + debug_log = Path("/tmp/archivebox_crawl_debug.log") + with open(debug_log, "a") as f: + f.write(f"\n=== Crawl.run() starting for {self.id} at {time.time()} ===\n") + f.flush() + + def get_runtime_config(): + config = get_config(crawl=self) + if persona_runtime_overrides: + config.update(persona_runtime_overrides) + return config + + system_task = self.get_system_task() + if system_task == "archivebox://update": + from archivebox.cli.archivebox_update import process_all_db_snapshots + + process_all_db_snapshots() + return None + + machine = Machine.current() + declared_binary_names: set[str] = set() + persona_runtime_overrides: dict[str, str] = {} + persona = self.resolve_persona() + if persona: + base_runtime_config = get_config(crawl=self, persona=persona) + chrome_binary = str(base_runtime_config.get("CHROME_BINARY") or "") + persona_runtime_overrides = persona.prepare_runtime_for_crawl( + crawl=self, + chrome_binary=chrome_binary, + ) + + executed_crawl_hooks: set[str] = set() + + def run_crawl_hook(hook: Path) -> set[str]: + executed_crawl_hooks.add(str(hook)) + primary_url = next( + (line.strip() for line in self.urls.splitlines() if line.strip()), + self.urls.strip(), + ) + + with open(debug_log, "a") as f: + f.write(f"Running hook: {hook.name}\n") + f.flush() + hook_start = time.time() 
+ plugin_name = hook.parent.name + output_dir = self.output_dir / plugin_name + output_dir.mkdir(parents=True, exist_ok=True) + + process = run_hook( + hook, + output_dir=output_dir, + config=get_runtime_config(), + crawl_id=str(self.id), + source_url=self.urls, + url=primary_url, + snapshot_id=str(self.id), + ) + with open(debug_log, "a") as f: + f.write(f"Hook {hook.name} completed with status={process.status}\n") + f.flush() + + hook_elapsed = time.time() - hook_start + if hook_elapsed > 0.5: + print(f"[yellow]âąī¸ Hook {hook.name} took {hook_elapsed:.2f}s[/yellow]") + + if process.status == process.StatusChoices.RUNNING: + if not is_finite_background_hook(hook.name): + return set() + try: + process.wait(timeout=process.timeout) + except Exception: + return set() + + from archivebox.hooks import extract_records_from_process + + records = [] + # Finite background hooks can exit before their stdout log is fully + # visible to our polling loop. Give successful hooks a brief chance + # to flush JSONL records before we move on to downstream hooks. 
+ for delay in (0.0, 0.05, 0.1, 0.25, 0.5): + if delay: + time.sleep(delay) + records = extract_records_from_process(process) + if records: + break + if records: + print(f"[cyan]📝 Processing {len(records)} records from {hook.name}[/cyan]") + for record in records[:3]: + print(f" Record: type={record.get('type')}, keys={list(record.keys())[:5]}") + if system_task: + records = [record for record in records if record.get("type") in ("BinaryRequest", "Binary")] + overrides = {"crawl": self} + stats = process_hook_records(records, overrides=overrides) + if stats: + print(f"[green]✓ Created: {stats}[/green]") + + hook_binary_names = { + str(record.get("name")).strip() + for record in records + if record.get("type") in ("BinaryRequest", "Binary") and record.get("name") + } + hook_binary_names.discard("") + if hook_binary_names: + declared_binary_names.update(hook_binary_names) + return hook_binary_names + + def resolve_provider_binaries(binary_names: set[str]) -> set[str]: + if not binary_names: + return set() + + resolved_binary_names = set(binary_names) + + while True: + unresolved_binaries = list( + Binary.objects.filter( + machine=machine, + name__in=resolved_binary_names, + ) + .exclude( + status=Binary.StatusChoices.INSTALLED, + ) + .order_by("name"), + ) + if not unresolved_binaries: + return resolved_binary_names + + needed_provider_names: set[str] = set() + for binary in unresolved_binaries: + allowed_binproviders = binary._allowed_binproviders() + if allowed_binproviders is None: + continue + needed_provider_names.update(allowed_binproviders) + + if not needed_provider_names: + return resolved_binary_names + + provider_hooks = [ + hook + for hook in discover_hooks("Crawl", filter_disabled=False, config=get_runtime_config()) + if hook.parent.name in needed_provider_names and str(hook) not in executed_crawl_hooks + ] + if not provider_hooks: + return resolved_binary_names + + for hook in provider_hooks: + resolved_binary_names.update(run_crawl_hook(hook)) + + # 
Discover and run on_Crawl hooks + with open(debug_log, "a") as f: + f.write("Discovering Crawl hooks...\n") + f.flush() + hooks = discover_hooks("Crawl", config=get_runtime_config()) + with open(debug_log, "a") as f: + f.write(f"Found {len(hooks)} hooks\n") + f.flush() + + for hook in hooks: + hook_binary_names = run_crawl_hook(hook) + if hook_binary_names: + self.install_declared_binaries(resolve_provider_binaries(hook_binary_names), machine=machine) + + # Safety check: don't create snapshots if any crawl-declared dependency + # is still unresolved after all crawl hooks have run. + self.install_declared_binaries(declared_binary_names, machine=machine) + + # Create snapshots from all URLs in self.urls + if system_task: + leaked_snapshots = self.snapshot_set.all() + if leaked_snapshots.exists(): + leaked_count = leaked_snapshots.count() + leaked_snapshots.delete() + print(f"[yellow]âš ī¸ Removed {leaked_count} leaked snapshot(s) created during system crawl {system_task}[/yellow]") + with open(debug_log, "a") as f: + f.write(f"Skipping snapshot creation for system crawl: {system_task}\n") + f.write("=== Crawl.run() complete ===\n\n") + f.flush() + return None + + with open(debug_log, "a") as f: + f.write("Creating snapshots from URLs...\n") + f.flush() + created_snapshots = self.create_snapshots_from_urls() + with open(debug_log, "a") as f: + f.write(f"Created {len(created_snapshots)} snapshots\n") + f.write("=== Crawl.run() complete ===\n\n") + f.flush() + + # Return first snapshot for this crawl (newly created or existing) + # This ensures the crawl doesn't seal if snapshots exist, even if they weren't just created + return self.snapshot_set.first() + + def is_finished(self) -> bool: + """Check if crawl is finished (all snapshots sealed or no snapshots exist).""" + from archivebox.core.models import Snapshot + + # Check if any snapshots exist for this crawl + snapshots = Snapshot.objects.filter(crawl=self) + + # If no snapshots exist, allow finishing (e.g., system 
class CrawlMachine(BaseStateMachine):
    """
    State machine for managing Crawl lifecycle.

    Hook Lifecycle:
    ┌─────────────────────────────────────────────────────────────┐
    │ QUEUED State                                                │
    │  • Waiting for crawl to be ready (has URLs)                 │
    └─────────────────────────────────────────────────────────────┘
                    ↓ tick() when can_start()
    ┌─────────────────────────────────────────────────────────────┐
    │ STARTED State → enter_started()                             │
    │  1. crawl.run()                                             │
    │     • discover_hooks('Crawl') → finds all crawl hooks       │
    │     • For each hook:                                        │
    │       - run_hook(script, output_dir, ...)                   │
    │       - Parse JSONL from hook output                        │
    │       - process_hook_records() → creates Snapshots          │
    │     • create_snapshots_from_urls() → from self.urls field   │
    │                                                             │
    │  2. Snapshots process independently with their own          │
    │     state machines (see SnapshotMachine)                    │
    └─────────────────────────────────────────────────────────────┘
                    ↓ tick() when is_finished()
    ┌─────────────────────────────────────────────────────────────┐
    │ SEALED State → enter_sealed()                               │
    │  • cleanup() → removes .pid files, runs on_CrawlEnd hooks   │
    │  • Set retry_at=None (no more processing)                   │
    └─────────────────────────────────────────────────────────────┘

    If crawl.run() returns no snapshot, enter_started() fires the manual
    `seal` event right away instead of waiting for a later tick().
    """

    # The Crawl instance driven by this machine (exposed under model_attr_name).
    crawl: Crawl

    model_attr_name = "crawl"

    # States
    queued = State(value=Crawl.StatusChoices.QUEUED, initial=True)
    started = State(value=Crawl.StatusChoices.STARTED)
    sealed = State(value=Crawl.StatusChoices.SEALED, final=True)

    # Tick Event (polled by workers)
    tick = queued.to.itself(unless="can_start") | queued.to(started, cond="can_start") | started.to(sealed, cond="is_finished")

    # Manual event (triggered by last Snapshot sealing)
    seal = started.to(sealed)

    def can_start(self) -> bool:
        """Return True when the crawl has a non-empty urls field that yields at least one URL.

        Prints the reason whenever it returns False.
        """
        if not self.crawl.urls:
            print(f"[red]⚠️ Crawl {self.crawl.id} cannot start: no URLs[/red]")
            return False
        urls_list = self.crawl.get_urls_list()
        if not urls_list:
            print(f"[red]⚠️ Crawl {self.crawl.id} cannot start: no valid URLs in urls field[/red]")
            return False
        return True

    def is_finished(self) -> bool:
        """Check if all Snapshots for this crawl are finished."""
        return self.crawl.is_finished()

    @started.enter
    def enter_started(self):
        """Run the crawl, then either requeue it as STARTED or seal it immediately.

        Any exception from crawl.run() is reported with a traceback and re-raised.
        """
        import sys

        print(f"[cyan]🔄 CrawlMachine.enter_started() - creating snapshots for {self.crawl.id}[/cyan]", file=sys.stderr)

        try:
            # Run the crawl - runs hooks, processes JSONL, creates snapshots
            first_snapshot = self.crawl.run()

            if first_snapshot:
                print(
                    f"[cyan]🔄 Created {self.crawl.snapshot_set.count()} snapshot(s), first: {first_snapshot.url}[/cyan]",
                    file=sys.stderr,
                )
                # Update status to STARTED
                # Set retry_at to near future so tick() can poll and check is_finished()
                self.crawl.update_and_requeue(
                    retry_at=timezone.now() + timedelta(seconds=2),
                    status=Crawl.StatusChoices.STARTED,
                )
            else:
                # No snapshots (system crawl that only runs setup hooks)
                print("[cyan]🔄 No snapshots created, sealing crawl immediately[/cyan]", file=sys.stderr)
                # Seal immediately since there's no work to do
                self.seal()

        except Exception as e:
            print(f"[red]⚠️ Crawl {self.crawl.id} failed to start: {e}[/red]")
            import traceback

            traceback.print_exc()
            raise

    @sealed.enter
    def enter_sealed(self):
        """Run crawl.cleanup(), then persist status=SEALED with retry_at cleared."""
        # Clean up background hooks and run on_CrawlEnd hooks
        self.crawl.cleanup()

        self.crawl.update_and_requeue(
            retry_at=None,
            status=Crawl.StatusChoices.SEALED,
        )
1 1 *", +} + + +def normalize_schedule(schedule: str) -> str: + normalized = (schedule or "").strip() + if not normalized: + raise ValueError("Schedule cannot be empty.") + + return SCHEDULE_ALIASES.get(normalized.lower(), normalized) + + +def validate_schedule(schedule: str) -> str: + normalized = normalize_schedule(schedule) + if not croniter.is_valid(normalized): + raise ValueError( + "Invalid schedule. Use an alias like daily/weekly/monthly or a cron expression such as '0 */6 * * *'.", + ) + return normalized + + +def next_run_for_schedule(schedule: str, after: datetime) -> datetime: + normalized = validate_schedule(schedule) + return croniter(normalized, after).get_next(datetime) diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py deleted file mode 100644 index c0e0c433f0..0000000000 --- a/archivebox/extractors/__init__.py +++ /dev/null @@ -1,190 +0,0 @@ -__package__ = 'archivebox.extractors' - -import os -from pathlib import Path - -from typing import Optional, List, Iterable, Union -from datetime import datetime, timezone -from django.db.models import QuerySet - -from ..index.schema import Link -from ..index.sql import write_link_to_sql_index -from ..index import ( - load_link_details, - write_link_details, -) -from ..util import enforce_types -from ..logging_util import ( - log_archiving_started, - log_archiving_paused, - log_archiving_finished, - log_link_archiving_started, - log_link_archiving_finished, - log_archive_method_started, - log_archive_method_finished, -) -from ..search import write_search_index - -from .title import should_save_title, save_title -from .favicon import should_save_favicon, save_favicon -from .wget import should_save_wget, save_wget -from .singlefile import should_save_singlefile, save_singlefile -from .readability import should_save_readability, save_readability -from .mercury import should_save_mercury, save_mercury -from .pdf import should_save_pdf, save_pdf -from .screenshot import 
should_save_screenshot, save_screenshot -from .dom import should_save_dom, save_dom -from .git import should_save_git, save_git -from .media import should_save_media, save_media -from .archive_org import should_save_archive_dot_org, save_archive_dot_org -from .headers import should_save_headers, save_headers - - -def get_default_archive_methods(): - return [ - ('title', should_save_title, save_title), - ('favicon', should_save_favicon, save_favicon), - ('headers', should_save_headers, save_headers), - ('singlefile', should_save_singlefile, save_singlefile), - ('pdf', should_save_pdf, save_pdf), - ('screenshot', should_save_screenshot, save_screenshot), - ('dom', should_save_dom, save_dom), - ('wget', should_save_wget, save_wget), - ('readability', should_save_readability, save_readability), # keep readability below wget and singlefile, as it depends on them - ('mercury', should_save_mercury, save_mercury), - ('git', should_save_git, save_git), - ('media', should_save_media, save_media), - ('archive_org', should_save_archive_dot_org, save_archive_dot_org), - ] - -ARCHIVE_METHODS_INDEXING_PRECEDENCE = [('readability', 1), ('singlefile', 2), ('dom', 3), ('wget', 4)] - -@enforce_types -def ignore_methods(to_ignore: List[str]): - ARCHIVE_METHODS = get_default_archive_methods() - methods = filter(lambda x: x[0] not in to_ignore, ARCHIVE_METHODS) - methods = map(lambda x: x[0], methods) - return list(methods) - -@enforce_types -def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> Link: - """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp""" - - # TODO: Remove when the input is changed to be a snapshot. Suboptimal approach. 
- from core.models import Snapshot, ArchiveResult - try: - snapshot = Snapshot.objects.get(url=link.url) # TODO: This will be unnecessary once everything is a snapshot - except Snapshot.DoesNotExist: - snapshot = write_link_to_sql_index(link) - - ARCHIVE_METHODS = get_default_archive_methods() - - if methods: - ARCHIVE_METHODS = [ - method for method in ARCHIVE_METHODS - if method[0] in methods - ] - - out_dir = out_dir or Path(link.link_dir) - try: - is_new = not Path(out_dir).exists() - if is_new: - os.makedirs(out_dir) - - link = load_link_details(link, out_dir=out_dir) - write_link_details(link, out_dir=out_dir, skip_sql_index=False) - log_link_archiving_started(link, out_dir, is_new) - link = link.overwrite(updated=datetime.now(timezone.utc)) - stats = {'skipped': 0, 'succeeded': 0, 'failed': 0} - start_ts = datetime.now(timezone.utc) - - for method_name, should_run, method_function in ARCHIVE_METHODS: - try: - if method_name not in link.history: - link.history[method_name] = [] - - if should_run(link, out_dir, overwrite): - log_archive_method_started(method_name) - - result = method_function(link=link, out_dir=out_dir) - - link.history[method_name].append(result) - - stats[result.status] += 1 - log_archive_method_finished(result) - write_search_index(link=link, texts=result.index_texts) - ArchiveResult.objects.create(snapshot=snapshot, extractor=method_name, cmd=result.cmd, cmd_version=result.cmd_version, - output=result.output, pwd=result.pwd, start_ts=result.start_ts, end_ts=result.end_ts, status=result.status) - - - # bump the updated time on the main Snapshot here, this is critical - # to be able to cache summaries of the ArchiveResults for a given - # snapshot without having to load all the results from the DB each time. 
- # (we use {Snapshot.id}-{Snapshot.updated} as the cache key and assume - # ArchiveResults are unchanged as long as the updated timestamp is unchanged) - snapshot.save() - else: - # print('{black} X {}{reset}'.format(method_name, **ANSI)) - stats['skipped'] += 1 - except Exception as e: - raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format( - method_name, - link.url, - )) from e - - # print(' ', stats) - - try: - latest_title = link.history['title'][-1].output.strip() - if latest_title and len(latest_title) >= len(link.title or ''): - link = link.overwrite(title=latest_title) - except Exception: - pass - - write_link_details(link, out_dir=out_dir, skip_sql_index=False) - - log_link_archiving_finished(link, link.link_dir, is_new, stats, start_ts) - - except KeyboardInterrupt: - try: - write_link_details(link, out_dir=link.link_dir) - except: - pass - raise - - except Exception as err: - print(' ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err)) - raise - - return link - -@enforce_types -def archive_links(all_links: Union[Iterable[Link], QuerySet], overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> List[Link]: - - if type(all_links) is QuerySet: - num_links: int = all_links.count() - get_link = lambda x: x.as_link() - all_links = all_links.iterator() - else: - num_links: int = len(all_links) - get_link = lambda x: x - - if num_links == 0: - return [] - - log_archiving_started(num_links) - idx: int = 0 - try: - for link in all_links: - idx += 1 - to_archive = get_link(link) - archive_link(to_archive, overwrite=overwrite, methods=methods, out_dir=Path(link.link_dir)) - except KeyboardInterrupt: - log_archiving_paused(num_links, idx, link.timestamp) - raise SystemExit(0) - except BaseException: - print() - raise - - log_archiving_finished(num_links) - return all_links diff --git a/archivebox/extractors/archive_org.py b/archivebox/extractors/archive_org.py deleted file mode 100644 
index a088311355..0000000000 --- a/archivebox/extractors/archive_org.py +++ /dev/null @@ -1,112 +0,0 @@ -__package__ = 'archivebox.extractors' - - -from pathlib import Path -from typing import Optional, List, Dict, Tuple -from collections import defaultdict - -from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError -from ..system import run, chmod_file -from ..util import ( - enforce_types, - is_static_file, -) -from ..config import ( - TIMEOUT, - CURL_ARGS, - CHECK_SSL_VALIDITY, - SAVE_ARCHIVE_DOT_ORG, - CURL_BINARY, - CURL_VERSION, - CURL_USER_AGENT, -) -from ..logging_util import TimedProgress - - - -@enforce_types -def should_save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool: - if is_static_file(link.url): - return False - - out_dir = out_dir or Path(link.link_dir) - if not overwrite and (out_dir / 'archive.org.txt').exists(): - # if open(path, 'r', encoding='utf-8').read().strip() != 'None': - return False - - return SAVE_ARCHIVE_DOT_ORG - -@enforce_types -def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult: - """submit site to archive.org for archiving via their service, save returned archive url""" - - out_dir = out_dir or Path(link.link_dir) - output: ArchiveOutput = 'archive.org.txt' - archive_org_url = None - submit_url = 'https://web.archive.org/save/{}'.format(link.url) - cmd = [ - CURL_BINARY, - *CURL_ARGS, - '--head', - '--max-time', str(timeout), - *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []), - *([] if CHECK_SSL_VALIDITY else ['--insecure']), - submit_url, - ] - status = 'succeeded' - timer = TimedProgress(timeout, prefix=' ') - try: - result = run(cmd, cwd=str(out_dir), timeout=timeout) - content_location, errors = parse_archive_dot_org_response(result.stdout) - if content_location: - archive_org_url = content_location[0] - elif len(errors) == 1 and 'RobotAccessControlException' in 
errors[0]: - archive_org_url = None - # raise ArchiveError('Archive.org denied by {}/robots.txt'.format(domain(link.url))) - elif errors: - raise ArchiveError(', '.join(errors)) - else: - raise ArchiveError('Failed to find "content-location" URL header in Archive.org response.') - except Exception as err: - status = 'failed' - output = err - finally: - timer.end() - - if output and not isinstance(output, Exception): - # instead of writing None when archive.org rejects the url write the - # url to resubmit it to archive.org. This is so when the user visits - # the URL in person, it will attempt to re-archive it, and it'll show the - # nicer error message explaining why the url was rejected if it fails. - archive_org_url = archive_org_url or submit_url - with open(str(out_dir / output), 'w', encoding='utf-8') as f: - f.write(archive_org_url) - chmod_file('archive.org.txt', cwd=str(out_dir)) - output = archive_org_url - - return ArchiveResult( - cmd=cmd, - pwd=str(out_dir), - cmd_version=CURL_VERSION, - output=output, - status=status, - **timer.stats, - ) - -@enforce_types -def parse_archive_dot_org_response(response: bytes) -> Tuple[List[str], List[str]]: - # Parse archive.org response headers - headers: Dict[str, List[str]] = defaultdict(list) - - # lowercase all the header names and store in dict - for header in response.splitlines(): - if b':' not in header or not header.strip(): - continue - name, val = header.decode().split(':', 1) - headers[name.lower().strip()].append(val.strip()) - - # Get successful archive url in "content-location" header or any errors - content_location = headers.get('content-location', headers['location']) - errors = headers['x-archive-wayback-runtime-error'] - return content_location, errors - diff --git a/archivebox/extractors/dom.py b/archivebox/extractors/dom.py deleted file mode 100644 index ec2df073ff..0000000000 --- a/archivebox/extractors/dom.py +++ /dev/null @@ -1,69 +0,0 @@ -__package__ = 'archivebox.extractors' - -from pathlib 
import Path -from typing import Optional - -from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError -from ..system import run, chmod_file, atomic_write -from ..util import ( - enforce_types, - is_static_file, - chrome_args, -) -from ..config import ( - TIMEOUT, - SAVE_DOM, - CHROME_VERSION, -) -from ..logging_util import TimedProgress - - - -@enforce_types -def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool: - if is_static_file(link.url): - return False - - out_dir = out_dir or Path(link.link_dir) - if not overwrite and (out_dir / 'output.html').exists(): - return False - - return SAVE_DOM - -@enforce_types -def save_dom(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult: - """print HTML of site to file using chrome --dump-html""" - - out_dir = out_dir or Path(link.link_dir) - output: ArchiveOutput = 'output.html' - output_path = out_dir / output - cmd = [ - *chrome_args(TIMEOUT=timeout), - '--dump-dom', - link.url - ] - status = 'succeeded' - timer = TimedProgress(timeout, prefix=' ') - try: - result = run(cmd, cwd=str(out_dir), timeout=timeout) - atomic_write(output_path, result.stdout) - - if result.returncode: - hints = result.stderr.decode() - raise ArchiveError('Failed to save DOM', hints) - - chmod_file(output, cwd=str(out_dir)) - except Exception as err: - status = 'failed' - output = err - finally: - timer.end() - - return ArchiveResult( - cmd=cmd, - pwd=str(out_dir), - cmd_version=CHROME_VERSION, - output=output, - status=status, - **timer.stats, - ) diff --git a/archivebox/extractors/favicon.py b/archivebox/extractors/favicon.py deleted file mode 100644 index b8831d0cf6..0000000000 --- a/archivebox/extractors/favicon.py +++ /dev/null @@ -1,63 +0,0 @@ -__package__ = 'archivebox.extractors' - -from pathlib import Path - -from typing import Optional - -from ..index.schema import Link, ArchiveResult, ArchiveOutput -from ..system import chmod_file, run -from 
..util import enforce_types, domain -from ..config import ( - TIMEOUT, - SAVE_FAVICON, - CURL_BINARY, - CURL_ARGS, - CURL_VERSION, - CHECK_SSL_VALIDITY, - CURL_USER_AGENT, -) -from ..logging_util import TimedProgress - - -@enforce_types -def should_save_favicon(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool: - out_dir = out_dir or Path(link.link_dir) - if not overwrite and (out_dir / 'favicon.ico').exists(): - return False - - return SAVE_FAVICON - -@enforce_types -def save_favicon(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult: - """download site favicon from google's favicon api""" - - out_dir = out_dir or link.link_dir - output: ArchiveOutput = 'favicon.ico' - cmd = [ - CURL_BINARY, - *CURL_ARGS, - '--max-time', str(timeout), - '--output', str(output), - *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []), - *([] if CHECK_SSL_VALIDITY else ['--insecure']), - 'https://www.google.com/s2/favicons?domain={}'.format(domain(link.url)), - ] - status = 'failed' - timer = TimedProgress(timeout, prefix=' ') - try: - run(cmd, cwd=str(out_dir), timeout=timeout) - chmod_file(output, cwd=str(out_dir)) - status = 'succeeded' - except Exception as err: - output = err - finally: - timer.end() - - return ArchiveResult( - cmd=cmd, - pwd=str(out_dir), - cmd_version=CURL_VERSION, - output=output, - status=status, - **timer.stats, - ) diff --git a/archivebox/extractors/git.py b/archivebox/extractors/git.py deleted file mode 100644 index efef37c25d..0000000000 --- a/archivebox/extractors/git.py +++ /dev/null @@ -1,90 +0,0 @@ -__package__ = 'archivebox.extractors' - - -from pathlib import Path -from typing import Optional - -from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError -from ..system import run, chmod_file -from ..util import ( - enforce_types, - is_static_file, - domain, - extension, - without_query, - without_fragment, -) -from ..config import ( - TIMEOUT, - 
SAVE_GIT, - GIT_BINARY, - GIT_ARGS, - GIT_VERSION, - GIT_DOMAINS, - CHECK_SSL_VALIDITY -) -from ..logging_util import TimedProgress - - - -@enforce_types -def should_save_git(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool: - if is_static_file(link.url): - return False - - out_dir = out_dir or Path(link.link_dir) - if not overwrite and (out_dir / 'git').exists(): - return False - - is_clonable_url = ( - (domain(link.url) in GIT_DOMAINS) - or (extension(link.url) == 'git') - ) - if not is_clonable_url: - return False - - return SAVE_GIT - - -@enforce_types -def save_git(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult: - """download full site using git""" - - out_dir = out_dir or Path(link.link_dir) - output: ArchiveOutput = 'git' - output_path = out_dir / output - output_path.mkdir(exist_ok=True) - cmd = [ - GIT_BINARY, - 'clone', - *GIT_ARGS, - *([] if CHECK_SSL_VALIDITY else ['-c', 'http.sslVerify=false']), - without_query(without_fragment(link.url)), - ] - status = 'succeeded' - timer = TimedProgress(timeout, prefix=' ') - try: - result = run(cmd, cwd=str(output_path), timeout=timeout + 1) - if result.returncode == 128: - # ignore failed re-download when the folder already exists - pass - elif result.returncode > 0: - hints = 'Got git response code: {}.'.format(result.returncode) - raise ArchiveError('Failed to save git clone', hints) - - chmod_file(output, cwd=str(out_dir)) - - except Exception as err: - status = 'failed' - output = err - finally: - timer.end() - - return ArchiveResult( - cmd=cmd, - pwd=str(out_dir), - cmd_version=GIT_VERSION, - output=output, - status=status, - **timer.stats, - ) diff --git a/archivebox/extractors/headers.py b/archivebox/extractors/headers.py deleted file mode 100644 index 91dcb8e3a1..0000000000 --- a/archivebox/extractors/headers.py +++ /dev/null @@ -1,70 +0,0 @@ -__package__ = 'archivebox.extractors' - -from pathlib import Path - -from typing import Optional 
- -from ..index.schema import Link, ArchiveResult, ArchiveOutput -from ..system import atomic_write -from ..util import ( - enforce_types, - get_headers, -) -from ..config import ( - TIMEOUT, - CURL_BINARY, - CURL_ARGS, - CURL_USER_AGENT, - CURL_VERSION, - CHECK_SSL_VALIDITY, - SAVE_HEADERS -) -from ..logging_util import TimedProgress - -@enforce_types -def should_save_headers(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool: - out_dir = out_dir or Path(link.link_dir) - if not overwrite and (out_dir / 'headers.json').exists(): - return False - - return SAVE_HEADERS - - -@enforce_types -def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult: - """Download site headers""" - - out_dir = Path(out_dir or link.link_dir) - output_folder = out_dir.absolute() - output: ArchiveOutput = 'headers.json' - - status = 'succeeded' - timer = TimedProgress(timeout, prefix=' ') - - cmd = [ - CURL_BINARY, - *CURL_ARGS, - '--head', - '--max-time', str(timeout), - *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []), - *([] if CHECK_SSL_VALIDITY else ['--insecure']), - link.url, - ] - try: - json_headers = get_headers(link.url, timeout=timeout) - output_folder.mkdir(exist_ok=True) - atomic_write(str(output_folder / "headers.json"), json_headers) - except (Exception, OSError) as err: - status = 'failed' - output = err - finally: - timer.end() - - return ArchiveResult( - cmd=cmd, - pwd=str(out_dir), - cmd_version=CURL_VERSION, - output=output, - status=status, - **timer.stats, - ) diff --git a/archivebox/extractors/media.py b/archivebox/extractors/media.py deleted file mode 100644 index e41a4002f8..0000000000 --- a/archivebox/extractors/media.py +++ /dev/null @@ -1,93 +0,0 @@ -__package__ = 'archivebox.extractors' - -from pathlib import Path -from typing import Optional - -from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError -from ..system import run, chmod_file -from 
..util import ( - enforce_types, - is_static_file, -) -from ..config import ( - MEDIA_TIMEOUT, - SAVE_MEDIA, - YOUTUBEDL_ARGS, - YOUTUBEDL_BINARY, - YOUTUBEDL_VERSION, - CHECK_SSL_VALIDITY -) -from ..logging_util import TimedProgress - - -@enforce_types -def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool: - if is_static_file(link.url): - return False - - out_dir = out_dir or Path(link.link_dir) - if not overwrite and (out_dir / 'media').exists(): - return False - - return SAVE_MEDIA - -@enforce_types -def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIMEOUT) -> ArchiveResult: - """Download playlists or individual video, audio, and subtitles using youtube-dl""" - - out_dir = out_dir or Path(link.link_dir) - output: ArchiveOutput = 'media' - output_path = out_dir / output - output_path.mkdir(exist_ok=True) - cmd = [ - YOUTUBEDL_BINARY, - *YOUTUBEDL_ARGS, - *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']), - link.url, - ] - status = 'succeeded' - timer = TimedProgress(timeout, prefix=' ') - try: - result = run(cmd, cwd=str(output_path), timeout=timeout + 1) - chmod_file(output, cwd=str(out_dir)) - if result.returncode: - if (b'ERROR: Unsupported URL' in result.stderr - or b'HTTP Error 404' in result.stderr - or b'HTTP Error 403' in result.stderr - or b'URL could be a direct video link' in result.stderr - or b'Unable to extract container ID' in result.stderr): - # These happen too frequently on non-media pages to warrant printing to console - pass - else: - hints = ( - 'Got youtube-dl response code: {}.'.format(result.returncode), - *result.stderr.decode().split('\n'), - ) - raise ArchiveError('Failed to save media', hints) - except Exception as err: - status = 'failed' - output = err - finally: - timer.end() - - # add video description and subtitles to full-text index - index_texts = [ - text_file.read_text(encoding='utf-8').strip() - for text_file in ( - 
*output_path.glob('*.description'), - *output_path.glob('*.srt'), - *output_path.glob('*.vtt'), - *output_path.glob('*.lrc'), - *output_path.glob('*.lrc'), - ) - ] - - return ArchiveResult( - cmd=cmd, - pwd=str(out_dir), - cmd_version=YOUTUBEDL_VERSION, - output=output, - status=status, - index_texts=index_texts, - **timer.stats, - ) diff --git a/archivebox/extractors/mercury.py b/archivebox/extractors/mercury.py deleted file mode 100644 index e7d2036251..0000000000 --- a/archivebox/extractors/mercury.py +++ /dev/null @@ -1,114 +0,0 @@ -__package__ = 'archivebox.extractors' - -from pathlib import Path - -from subprocess import CompletedProcess -from typing import Optional, List -import json - -from ..index.schema import Link, ArchiveResult, ArchiveError -from ..system import run, atomic_write -from ..util import ( - enforce_types, - is_static_file, - -) -from ..config import ( - TIMEOUT, - SAVE_MERCURY, - DEPENDENCIES, - MERCURY_VERSION, -) -from ..logging_util import TimedProgress - - - -@enforce_types -def ShellError(cmd: List[str], result: CompletedProcess, lines: int=20) -> ArchiveError: - # parse out last line of stderr - return ArchiveError( - f'Got {cmd[0]} response code: {result.returncode}).', - " ".join( - line.strip() - for line in (result.stdout + result.stderr).decode().rsplit('\n', lines)[-lines:] - if line.strip() - ), - ) - - -@enforce_types -def should_save_mercury(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool: - if is_static_file(link.url): - return False - - out_dir = out_dir or Path(link.link_dir) - if not overwrite and (out_dir / 'mercury').exists(): - return False - - return SAVE_MERCURY - - -@enforce_types -def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult: - """download reader friendly version using @postlight/mercury-parser""" - - out_dir = Path(out_dir or link.link_dir) - output_folder = out_dir.absolute() / "mercury" - output = "mercury" - - status = 
'succeeded' - timer = TimedProgress(timeout, prefix=' ') - try: - output_folder.mkdir(exist_ok=True) - - # Get plain text version of article - cmd = [ - DEPENDENCIES['MERCURY_BINARY']['path'], - link.url, - "--format=text" - ] - result = run(cmd, cwd=out_dir, timeout=timeout) - try: - article_text = json.loads(result.stdout) - except json.JSONDecodeError: - raise ShellError(cmd, result) - - if article_text.get('failed'): - raise ArchiveError('Mercury was not able to get article text from the URL') - - atomic_write(str(output_folder / "content.txt"), article_text["content"]) - - # Get HTML version of article - cmd = [ - DEPENDENCIES['MERCURY_BINARY']['path'], - link.url - ] - result = run(cmd, cwd=out_dir, timeout=timeout) - try: - article_json = json.loads(result.stdout) - except json.JSONDecodeError: - raise ShellError(cmd, result) - - if article_text.get('failed'): - raise ArchiveError('Mercury was not able to get article HTML from the URL') - - atomic_write(str(output_folder / "content.html"), article_json.pop("content")) - atomic_write(str(output_folder / "article.json"), article_json) - - # Check for common failure cases - if (result.returncode > 0): - raise ShellError(cmd, result) - except (ArchiveError, Exception, OSError) as err: - status = 'failed' - output = err - finally: - timer.end() - - return ArchiveResult( - cmd=cmd, - pwd=str(out_dir), - cmd_version=MERCURY_VERSION, - output=output, - status=status, - **timer.stats, - ) diff --git a/archivebox/extractors/pdf.py b/archivebox/extractors/pdf.py deleted file mode 100644 index 7138206c94..0000000000 --- a/archivebox/extractors/pdf.py +++ /dev/null @@ -1,68 +0,0 @@ -__package__ = 'archivebox.extractors' - -from pathlib import Path -from typing import Optional - -from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError -from ..system import run, chmod_file -from ..util import ( - enforce_types, - is_static_file, - chrome_args, -) -from ..config import ( - TIMEOUT, - SAVE_PDF, - 
CHROME_VERSION, -) -from ..logging_util import TimedProgress - - -@enforce_types -def should_save_pdf(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool: - if is_static_file(link.url): - return False - - out_dir = out_dir or Path(link.link_dir) - if not overwrite and (out_dir / 'output.pdf').exists(): - return False - - return SAVE_PDF - - -@enforce_types -def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult: - """print PDF of site to file using chrome --headless""" - - out_dir = out_dir or Path(link.link_dir) - output: ArchiveOutput = 'output.pdf' - cmd = [ - *chrome_args(TIMEOUT=timeout), - '--print-to-pdf', - link.url, - ] - status = 'succeeded' - timer = TimedProgress(timeout, prefix=' ') - try: - result = run(cmd, cwd=str(out_dir), timeout=timeout) - - if result.returncode: - hints = (result.stderr or result.stdout).decode() - raise ArchiveError('Failed to save PDF', hints) - - chmod_file('output.pdf', cwd=str(out_dir)) - except Exception as err: - status = 'failed' - output = err - finally: - timer.end() - - - return ArchiveResult( - cmd=cmd, - pwd=str(out_dir), - cmd_version=CHROME_VERSION, - output=output, - status=status, - **timer.stats, - ) diff --git a/archivebox/extractors/readability.py b/archivebox/extractors/readability.py deleted file mode 100644 index bc6d6656f3..0000000000 --- a/archivebox/extractors/readability.py +++ /dev/null @@ -1,135 +0,0 @@ -__package__ = 'archivebox.extractors' - -from pathlib import Path -from tempfile import NamedTemporaryFile - -from typing import Optional -import json - -from ..index.schema import Link, ArchiveResult, ArchiveError -from ..system import run, atomic_write -from ..util import ( - enforce_types, - download_url, - is_static_file, - -) -from ..config import ( - TIMEOUT, - CURL_BINARY, - SAVE_READABILITY, - DEPENDENCIES, - READABILITY_VERSION, -) -from ..logging_util import TimedProgress - -@enforce_types -def get_html(link: Link, path: 
Path) -> str: - """ - Try to find wget, singlefile and then dom files. - If none is found, download the url again. - """ - canonical = link.canonical_outputs() - abs_path = path.absolute() - sources = [canonical["singlefile_path"], canonical["wget_path"], canonical["dom_path"]] - document = None - for source in sources: - try: - with open(abs_path / source, "r", encoding="utf-8") as f: - document = f.read() - break - except (FileNotFoundError, TypeError): - continue - if document is None: - return download_url(link.url) - else: - return document - -@enforce_types -def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool: - if is_static_file(link.url): - return False - - out_dir = out_dir or Path(link.link_dir) - if not overwrite and (out_dir / 'readability').exists(): - return False - - return SAVE_READABILITY - - -@enforce_types -def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult: - """download reader friendly version using @mozilla/readability""" - - out_dir = Path(out_dir or link.link_dir) - output_folder = out_dir.absolute() / "readability" - output = "readability" - - # Readability Docs: https://github.com/mozilla/readability - - status = 'succeeded' - # fake command to show the user so they have something to try debugging if get_html fails - cmd = [ - CURL_BINARY, - link.url - ] - readability_content = None - timer = TimedProgress(timeout, prefix=' ') - try: - document = get_html(link, out_dir) - temp_doc = NamedTemporaryFile(delete=False) - temp_doc.write(document.encode("utf-8")) - temp_doc.close() - - if not document or len(document) < 10: - raise ArchiveError('Readability could not find HTML to parse for article text') - - cmd = [ - DEPENDENCIES['READABILITY_BINARY']['path'], - temp_doc.name, - ] - - result = run(cmd, cwd=out_dir, timeout=timeout) - try: - result_json = json.loads(result.stdout) - assert result_json and 'content' in result_json - except 
json.JSONDecodeError: - raise ArchiveError('Readability was not able to archive the page', result.stdout + result.stderr) - - output_folder.mkdir(exist_ok=True) - readability_content = result_json.pop("textContent") - atomic_write(str(output_folder / "content.html"), result_json.pop("content")) - atomic_write(str(output_folder / "content.txt"), readability_content) - atomic_write(str(output_folder / "article.json"), result_json) - - # parse out number of files downloaded from last line of stderr: - # "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)" - output_tail = [ - line.strip() - for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:] - if line.strip() - ] - hints = ( - 'Got readability response code: {}.'.format(result.returncode), - *output_tail, - ) - - # Check for common failure cases - if (result.returncode > 0): - raise ArchiveError('Readability was not able to archive the page', hints) - except (Exception, OSError) as err: - status = 'failed' - output = err - cmd = [cmd[0], './{singlefile,dom}.html'] - finally: - timer.end() - - return ArchiveResult( - cmd=cmd, - pwd=str(out_dir), - cmd_version=READABILITY_VERSION, - output=output, - status=status, - index_texts=[readability_content] if readability_content else [], - **timer.stats, - ) diff --git a/archivebox/extractors/screenshot.py b/archivebox/extractors/screenshot.py deleted file mode 100644 index cc748bf69e..0000000000 --- a/archivebox/extractors/screenshot.py +++ /dev/null @@ -1,67 +0,0 @@ -__package__ = 'archivebox.extractors' - -from pathlib import Path -from typing import Optional - -from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError -from ..system import run, chmod_file -from ..util import ( - enforce_types, - is_static_file, - chrome_args, -) -from ..config import ( - TIMEOUT, - SAVE_SCREENSHOT, - CHROME_VERSION, -) -from ..logging_util import TimedProgress - - - -@enforce_types -def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, 
overwrite: Optional[bool]=False) -> bool: - if is_static_file(link.url): - return False - - out_dir = out_dir or Path(link.link_dir) - if not overwrite and (out_dir / 'screenshot.png').exists(): - return False - - return SAVE_SCREENSHOT - -@enforce_types -def save_screenshot(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult: - """take screenshot of site using chrome --headless""" - - out_dir = out_dir or Path(link.link_dir) - output: ArchiveOutput = 'screenshot.png' - cmd = [ - *chrome_args(TIMEOUT=timeout), - '--screenshot', - link.url, - ] - status = 'succeeded' - timer = TimedProgress(timeout, prefix=' ') - try: - result = run(cmd, cwd=str(out_dir), timeout=timeout) - - if result.returncode: - hints = (result.stderr or result.stdout).decode() - raise ArchiveError('Failed to save screenshot', hints) - - chmod_file(output, cwd=str(out_dir)) - except Exception as err: - status = 'failed' - output = err - finally: - timer.end() - - return ArchiveResult( - cmd=cmd, - pwd=str(out_dir), - cmd_version=CHROME_VERSION, - output=output, - status=status, - **timer.stats, - ) diff --git a/archivebox/extractors/singlefile.py b/archivebox/extractors/singlefile.py deleted file mode 100644 index 3279960e1e..0000000000 --- a/archivebox/extractors/singlefile.py +++ /dev/null @@ -1,92 +0,0 @@ -__package__ = 'archivebox.extractors' - -from pathlib import Path - -from typing import Optional -import json - -from ..index.schema import Link, ArchiveResult, ArchiveError -from ..system import run, chmod_file -from ..util import ( - enforce_types, - is_static_file, - chrome_args, -) -from ..config import ( - TIMEOUT, - SAVE_SINGLEFILE, - DEPENDENCIES, - SINGLEFILE_VERSION, - CHROME_BINARY, -) -from ..logging_util import TimedProgress - - -@enforce_types -def should_save_singlefile(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool: - if is_static_file(link.url): - return False - - out_dir = out_dir or Path(link.link_dir) - if 
not overwrite and (out_dir / 'singlefile.html').exists(): - return False - - return SAVE_SINGLEFILE - - -@enforce_types -def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult: - """download full site using single-file""" - - out_dir = out_dir or Path(link.link_dir) - output = "singlefile.html" - - browser_args = chrome_args(TIMEOUT=0) - - # SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli - browser_args = '--browser-args={}'.format(json.dumps(browser_args[1:])) - cmd = [ - DEPENDENCIES['SINGLEFILE_BINARY']['path'], - '--browser-executable-path={}'.format(CHROME_BINARY), - browser_args, - link.url, - output, - ] - - status = 'succeeded' - timer = TimedProgress(timeout, prefix=' ') - try: - result = run(cmd, cwd=str(out_dir), timeout=timeout) - - # parse out number of files downloaded from last line of stderr: - # "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)" - output_tail = [ - line.strip() - for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:] - if line.strip() - ] - hints = ( - 'Got single-file response code: {}.'.format(result.returncode), - *output_tail, - ) - - # Check for common failure cases - if (result.returncode > 0) or not (out_dir / output).is_file(): - raise ArchiveError('SingleFile was not able to archive the page', hints) - chmod_file(output, cwd=str(out_dir)) - except (Exception, OSError) as err: - status = 'failed' - # TODO: Make this prettier. This is necessary to run the command (escape JSON internal quotes). 
- cmd[2] = browser_args.replace('"', "\\\"") - output = err - finally: - timer.end() - - return ArchiveResult( - cmd=cmd, - pwd=str(out_dir), - cmd_version=SINGLEFILE_VERSION, - output=output, - status=status, - **timer.stats, - ) diff --git a/archivebox/extractors/title.py b/archivebox/extractors/title.py deleted file mode 100644 index 272eebc8fb..0000000000 --- a/archivebox/extractors/title.py +++ /dev/null @@ -1,130 +0,0 @@ -__package__ = 'archivebox.extractors' - -import re -from html.parser import HTMLParser -from pathlib import Path -from typing import Optional - -from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError -from ..util import ( - enforce_types, - download_url, - htmldecode, -) -from ..config import ( - TIMEOUT, - CHECK_SSL_VALIDITY, - SAVE_TITLE, - CURL_BINARY, - CURL_ARGS, - CURL_VERSION, - CURL_USER_AGENT, -) -from ..logging_util import TimedProgress - - - -HTML_TITLE_REGEX = re.compile( - r'' # start matching text after tag - r'(.[^<>]+)', # get everything up to these symbols - re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE, -) - - -class TitleParser(HTMLParser): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.title_tag = "" - self.title_og = "" - self.inside_title_tag = False - - @property - def title(self): - return self.title_tag or self.title_og or None - - def handle_starttag(self, tag, attrs): - if tag.lower() == "title" and not self.title_tag: - self.inside_title_tag = True - elif tag.lower() == "meta" and not self.title_og: - attrs = dict(attrs) - if attrs.get("property") == "og:title" and attrs.get("content"): - self.title_og = attrs.get("content") - - def handle_data(self, data): - if self.inside_title_tag and data: - self.title_tag += data.strip() - - def handle_endtag(self, tag): - if tag.lower() == "title": - self.inside_title_tag = False - - -@enforce_types -def should_save_title(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool: - # if 
link already has valid title, skip it - if not overwrite and link.title and not link.title.lower().startswith('http'): - return False - - return SAVE_TITLE - -def extract_title_with_regex(html): - match = re.search(HTML_TITLE_REGEX, html) - output = htmldecode(match.group(1).strip()) if match else None - return output - -@enforce_types -def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult: - """try to guess the page's title from its content""" - - from core.models import Snapshot - - output: ArchiveOutput = None - cmd = [ - CURL_BINARY, - *CURL_ARGS, - '--max-time', str(timeout), - *(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []), - *([] if CHECK_SSL_VALIDITY else ['--insecure']), - link.url, - ] - status = 'succeeded' - timer = TimedProgress(timeout, prefix=' ') - try: - html = download_url(link.url, timeout=timeout) - try: - # try using relatively strict html parser first - parser = TitleParser() - parser.feed(html) - output = parser.title - if output is None: - raise - except Exception: - # fallback to regex that can handle broken/malformed html - output = extract_title_with_regex(html) - - # if title is better than the one in the db, update db with new title - if isinstance(output, str) and output: - if not link.title or len(output) >= len(link.title): - Snapshot.objects.filter(url=link.url, - timestamp=link.timestamp)\ - .update(title=output) - else: - # if no content was returned, dont save a title (because it might be a temporary error) - if not html: - raise ArchiveError('Unable to detect page title') - # output = html[:128] # use first bit of content as the title - output = link.base_url # use the filename as the title (better UX) - except Exception as err: - status = 'failed' - output = err - finally: - timer.end() - - return ArchiveResult( - cmd=cmd, - pwd=str(out_dir), - cmd_version=CURL_VERSION, - output=output, - status=status, - **timer.stats, - ) diff --git 
a/archivebox/extractors/wget.py b/archivebox/extractors/wget.py deleted file mode 100644 index d4e09aa3e8..0000000000 --- a/archivebox/extractors/wget.py +++ /dev/null @@ -1,205 +0,0 @@ -__package__ = 'archivebox.extractors' - -import re -from pathlib import Path - -from typing import Optional -from datetime import datetime, timezone - -from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError -from ..system import run, chmod_file -from ..util import ( - enforce_types, - without_fragment, - without_query, - path, - domain, - urldecode, -) -from ..config import ( - WGET_ARGS, - TIMEOUT, - SAVE_WGET, - SAVE_WARC, - WGET_BINARY, - WGET_VERSION, - RESTRICT_FILE_NAMES, - CHECK_SSL_VALIDITY, - SAVE_WGET_REQUISITES, - WGET_AUTO_COMPRESSION, - WGET_USER_AGENT, - COOKIES_FILE, -) -from ..logging_util import TimedProgress - - -@enforce_types -def should_save_wget(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool: - output_path = wget_output_path(link) - out_dir = out_dir or Path(link.link_dir) - if not overwrite and output_path and (out_dir / output_path).exists(): - return False - - return SAVE_WGET - - -@enforce_types -def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult: - """download full site using wget""" - - out_dir = out_dir or link.link_dir - if SAVE_WARC: - warc_dir = out_dir / "warc" - warc_dir.mkdir(exist_ok=True) - warc_path = warc_dir / str(int(datetime.now(timezone.utc).timestamp())) - - # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html - output: ArchiveOutput = None - cmd = [ - WGET_BINARY, - # '--server-response', # print headers for better error parsing - *WGET_ARGS, - '--timeout={}'.format(timeout), - *(['--restrict-file-names={}'.format(RESTRICT_FILE_NAMES)] if RESTRICT_FILE_NAMES else []), - *(['--warc-file={}'.format(str(warc_path))] if SAVE_WARC else []), - *(['--page-requisites'] if SAVE_WGET_REQUISITES else []), - 
*(['--user-agent={}'.format(WGET_USER_AGENT)] if WGET_USER_AGENT else []), - *(['--load-cookies', str(COOKIES_FILE)] if COOKIES_FILE else []), - *(['--compression=auto'] if WGET_AUTO_COMPRESSION else []), - *([] if SAVE_WARC else ['--timestamping']), - *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']), - link.url, - ] - - status = 'succeeded' - timer = TimedProgress(timeout, prefix=' ') - try: - result = run(cmd, cwd=str(out_dir), timeout=timeout) - output = wget_output_path(link) - - # parse out number of files downloaded from last line of stderr: - # "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)" - output_tail = [ - line.strip() - for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:] - if line.strip() - ] - files_downloaded = ( - int(output_tail[-1].strip().split(' ', 2)[1] or 0) - if 'Downloaded:' in output_tail[-1] - else 0 - ) - hints = ( - 'Got wget response code: {}.'.format(result.returncode), - *output_tail, - ) - - # Check for common failure cases - if (result.returncode > 0 and files_downloaded < 1) or output is None: - if b'403: Forbidden' in result.stderr: - raise ArchiveError('403 Forbidden (try changing WGET_USER_AGENT)', hints) - if b'404: Not Found' in result.stderr: - raise ArchiveError('404 Not Found', hints) - if b'ERROR 500: Internal Server Error' in result.stderr: - raise ArchiveError('500 Internal Server Error', hints) - raise ArchiveError('Wget failed or got an error from the server', hints) - - if (out_dir / output).exists(): - chmod_file(output, cwd=str(out_dir)) - else: - print(f' {out_dir}/{output}') - raise ArchiveError('Failed to find wget output after running', hints) - except Exception as err: - status = 'failed' - output = err - finally: - timer.end() - - return ArchiveResult( - cmd=cmd, - pwd=str(out_dir), - cmd_version=WGET_VERSION, - output=output, - status=status, - **timer.stats, - ) - - -@enforce_types -def wget_output_path(link: Link) -> Optional[str]: - """calculate the path to 
the wgetted .html file, since wget may - adjust some paths to be different than the base_url path. - - See docs on wget --adjust-extension (-E) - """ - - # Wget downloads can save in a number of different ways depending on the url: - # https://example.com - # > example.com/index.html - # https://example.com?v=zzVa_tX1OiI - # > example.com/index.html?v=zzVa_tX1OiI.html - # https://www.example.com/?v=zzVa_tX1OiI - # > example.com/index.html?v=zzVa_tX1OiI.html - - # https://example.com/abc - # > example.com/abc.html - # https://example.com/abc/ - # > example.com/abc/index.html - # https://example.com/abc?v=zzVa_tX1OiI.html - # > example.com/abc?v=zzVa_tX1OiI.html - # https://example.com/abc/?v=zzVa_tX1OiI.html - # > example.com/abc/index.html?v=zzVa_tX1OiI.html - - # https://example.com/abc/test.html - # > example.com/abc/test.html - # https://example.com/abc/test?v=zzVa_tX1OiI - # > example.com/abc/test?v=zzVa_tX1OiI.html - # https://example.com/abc/test/?v=zzVa_tX1OiI - # > example.com/abc/test/index.html?v=zzVa_tX1OiI.html - - # There's also lots of complexity around how the urlencoding and renaming - # is done for pages with query and hash fragments or extensions like shtml / htm / php / etc - - # Since the wget algorithm for -E (appending .html) is incredibly complex - # and there's no way to get the computed output path from wget - # in order to avoid having to reverse-engineer how they calculate it, - # we just look in the output folder read the filename wget used from the filesystem - full_path = without_fragment(without_query(path(link.url))).strip('/') - search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+") / urldecode(full_path) - for _ in range(4): - if search_dir.exists(): - if search_dir.is_dir(): - html_files = [ - f for f in search_dir.iterdir() - if re.search(".+\\.[Ss]?[Hh][Tt][Mm][Ll]?$", str(f), re.I | re.M) - ] - if html_files: - return str(html_files[0].relative_to(link.link_dir)) - - # sometimes wget'd URLs have no ext and return 
non-html - # e.g. /some/example/rss/all -> some RSS XML content) - # /some/other/url.o4g -> some binary unrecognized ext) - # test this with archivebox add --depth=1 https://getpocket.com/users/nikisweeting/feed/all - last_part_of_url = urldecode(full_path.rsplit('/', 1)[-1]) - for file_present in search_dir.iterdir(): - if file_present == last_part_of_url: - return str((search_dir / file_present).relative_to(link.link_dir)) - - # Move up one directory level - search_dir = search_dir.parent - - if str(search_dir) == link.link_dir: - break - - # check for literally any file present that isnt an empty folder - domain_dir = Path(domain(link.url).replace(":", "+")) - files_within = list((Path(link.link_dir) / domain_dir).glob('**/*.*')) - if files_within: - return str((domain_dir / files_within[-1]).relative_to(link.link_dir)) - - # fallback to just the domain dir - search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+") - if search_dir.is_dir(): - return domain(link.url).replace(":", "+") - - return None diff --git a/archivebox/hooks.py b/archivebox/hooks.py new file mode 100644 index 0000000000..56a33b9b71 --- /dev/null +++ b/archivebox/hooks.py @@ -0,0 +1,1158 @@ +""" +Hook discovery and execution helpers for ArchiveBox plugins. + +ArchiveBox no longer drives plugin execution itself during normal crawls. +`abx-dl` owns the live runtime and emits typed bus events; ArchiveBox mainly: + +- discovers hook files for inspection / docs / legacy direct execution helpers +- executes individual hook scripts when explicitly requested +- parses hook stdout JSONL records into ArchiveBox models when needed + +Hook-backed event families are discovered from filenames like: + on_BinaryRequest__* + on_CrawlSetup__* + on_Snapshot__* + +Internal bus event names are normalized to the corresponding +`on_{EventFamily}__*` prefix by a simple string transform. If no scripts exist +for that prefix, discovery returns `[]`. 
+ +Directory structure: + abx_plugins/plugins/<plugin_name>/on_<Event>__<hook_name>.<ext> (built-in package) + data/custom_plugins/<plugin_name>/on_<Event>__<hook_name>.<ext> (user) + +Hook contract: + Input: --url=<url> (and other --key=value args) + Output: JSONL records to stdout, files to $PWD + Exit: 0 = success, non-zero = failure + +Execution order: + - Hooks are named with two-digit prefixes (00-99) and sorted lexicographically by filename + - Foreground hooks run sequentially in that order + - Background hooks (.bg suffix) run concurrently and do not block foreground progress + - After all foreground hooks complete, background hooks receive SIGTERM and must finalize + +Hook naming convention: + on_{EventFamily}__{run_order}_{description}[.finite.bg|.daemon.bg].{ext} + +API: + discover_hooks(event) -> List[Path] Find hook scripts for a hook-backed event family + run_hook(script, ...) -> Process Execute a hook script directly + is_background_hook(name) -> bool Check if hook is background (.bg suffix) +""" + +__package__ = "archivebox" + +import os +import json +from functools import lru_cache +from pathlib import Path +from typing import TYPE_CHECKING, Any, Optional, TypedDict + +from abx_plugins import get_plugins_dir +from django.conf import settings +from django.utils.safestring import mark_safe +from archivebox.config.constants import CONSTANTS +from archivebox.misc.util import fix_url_from_markdown, sanitize_extracted_url + +if TYPE_CHECKING: + from archivebox.machine.models import Process + + +# Plugin directories +BUILTIN_PLUGINS_DIR = Path(get_plugins_dir()).resolve() +USER_PLUGINS_DIR = Path( + os.environ.get("ARCHIVEBOX_USER_PLUGINS_DIR") or getattr(settings, "USER_PLUGINS_DIR", "") or str(CONSTANTS.USER_PLUGINS_DIR), +).expanduser() + + +# ============================================================================= +# Hook Step Extraction +# ============================================================================= + + +def 
def is_background_hook(hook_name: str) -> bool:
    """
    Report whether a hook filename marks a background hook.

    Background hooks do not block foreground progression. A name is treated
    as background when it contains either of these markers:
      - '.bg.'          (e.g. '.daemon.bg.js', '.finite.bg.py')
      - '__background'  (older naming convention)

    Args:
        hook_name: Hook filename (e.g., 'on_Snapshot__10_chrome_tab.daemon.bg.js')

    Returns:
        True if the name contains a background marker, False otherwise.

    Examples:
        is_background_hook('on_Snapshot__10_chrome_tab.daemon.bg.js') -> True
        is_background_hook('on_Snapshot__50_wget.py') -> False
        is_background_hook('on_Snapshot__63_media.finite.bg.py') -> True
    """
    background_markers = (".bg.", "__background")
    return any(marker in hook_name for marker in background_markers)
def discover_hooks(
    event_name: str,
    filter_disabled: bool = True,
    config: dict[str, Any] | None = None,
) -> list[Path]:
    """
    Find all hook scripts for an event family.

    Searches both the built-in and the user plugin directories for
    `on_{EventFamily}__*.{sh,py,js}` files, one level deep (inside a plugin
    directory) and directly in the plugins root.

    Hook naming convention uses numeric prefixes to control order:
        on_Snapshot__10_title.py         # runs first
        on_Snapshot__15_singlefile.py    # runs second
        on_Snapshot__26_readability.py   # runs later

    Args:
        event_name: Hook event family or event class name
            (e.g. 'BinaryRequestEvent', 'Snapshot'). A trailing `Event`
            is stripped by normalize_hook_event_name().
        filter_disabled: If True (default), drop hooks whose plugin is reported
            as disabled by get_plugin_special_config(). Never applied to the
            'BinaryRequest' family (see comment below).
        config: Merged config dict from get_config(). If None and filtering is
            active, get_config() is called with no arguments.

    Returns:
        De-duplicated hook paths sorted by filename, with the full path as a
        tie-breaker so hooks that share a filename across plugins always come
        back in the same order. Returns [] when the event name is empty.

    Examples:
        discover_hooks('Snapshot', config=config)
        discover_hooks('Snapshot', filter_disabled=False)
    """
    hook_event_name = normalize_hook_event_name(event_name)
    if not hook_event_name:
        return []

    plugin_roots = (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR)
    hooks: list[Path] = []

    for base_dir in plugin_roots:
        if not base_dir.exists():
            continue

        for ext in ("sh", "py", "js"):
            hook_glob = f"on_{hook_event_name}__*.{ext}"
            # Hooks inside plugin subdirectories
            hooks.extend(base_dir.glob(f"*/{hook_glob}"))
            # Hooks placed directly in the plugins directory
            hooks.extend(base_dir.glob(hook_glob))

    # Binary provider hooks are not end-user extractors. They
    # self-filter via `binproviders`, so applying the PLUGINS whitelist here
    # can hide the very installer needed by a selected plugin (e.g.
    # `--plugins=singlefile` still needs the `npm` BinaryRequest hook).
    if filter_disabled and hook_event_name != "BinaryRequest":
        # Get merged config if not provided (lazy import to avoid circular dependency)
        if config is None:
            from archivebox.config.configset import get_config

            config = get_config()

        # Resolve the plugin roots once instead of once per hook.
        resolved_roots = {root.resolve() for root in plugin_roots}
        # The enabled state depends only on (plugin name, config), so compute it
        # once per plugin even when a plugin ships several hooks for this event.
        enabled_by_plugin: dict[str, bool] = {}
        enabled_hooks: list[Path] = []

        for hook in hooks:
            if hook.parent.resolve() in resolved_roots:
                # Hook lives in the plugins root, not in a plugin subdir:
                # there is no plugin to check, so it is always included.
                enabled_hooks.append(hook)
                continue

            # e.g. abx_plugins/plugins/wget/on_Snapshot__50_wget.py -> 'wget'
            plugin_name = hook.parent.name
            if plugin_name not in enabled_by_plugin:
                enabled_by_plugin[plugin_name] = bool(get_plugin_special_config(plugin_name, config)["enabled"])
            if enabled_by_plugin[plugin_name]:
                enabled_hooks.append(hook)

        hooks = enabled_hooks

    # Sort by filename (not full path) so the numeric prefix controls ordering,
    # e.g. on_Snapshot__10_title.py sorts before on_Snapshot__26_readability.py.
    # The full path breaks ties: set iteration order is hash-dependent, so
    # sorting on the name alone leaves equal-named hooks in arbitrary order.
    return sorted(set(hooks), key=lambda p: (p.name, str(p)))
- REQUIRED + timeout: Maximum execution time in seconds + If None, auto-detects from PLUGINNAME_TIMEOUT config (fallback to TIMEOUT, default 300) + parent: Optional parent Process (for tracking worker->hook hierarchy) + **kwargs: Arguments passed to the script as --key=value + + Returns: + Process model instance (use process.exit_code, process.stdout, process.get_records()) + + Example: + from archivebox.config.configset import get_config + config = get_config(crawl=my_crawl, snapshot=my_snapshot) + process = run_hook(hook_path, output_dir, config=config, url=url, snapshot_id=id) + if process.status == 'exited': + records = process.get_records() # Get parsed JSONL output + """ + from archivebox.machine.models import Process, Machine, NetworkInterface + from archivebox.config.constants import CONSTANTS + import sys + + # Auto-detect timeout from plugin config if not explicitly provided + if timeout is None: + plugin_name = script.parent.name + plugin_config = get_plugin_special_config(plugin_name, config) + timeout = plugin_config["timeout"] + if timeout: + timeout = min(int(timeout), int(CONSTANTS.MAX_HOOK_RUNTIME_SECONDS)) + + # Get current machine + machine = Machine.current() + iface = NetworkInterface.current(refresh=True) + machine = iface.machine + + # Auto-detect parent process if not explicitly provided + # This enables automatic hierarchy tracking: Worker -> Hook + if parent is None: + try: + parent = Process.current() + except Exception: + # If Process.current() fails (e.g., not in a worker context), leave parent as None + pass + + if not script.exists(): + # Create a failed Process record for hooks that don't exist + process = Process.objects.create( + machine=machine, + iface=iface, + parent=parent, + process_type=Process.TypeChoices.HOOK, + pwd=str(output_dir), + cmd=["echo", f"Hook script not found: {script}"], + timeout=timeout, + status=Process.StatusChoices.EXITED, + exit_code=1, + stderr=f"Hook script not found: {script}", + ) + return process + + 
# Determine the interpreter based on file extension + ext = script.suffix.lower() + if ext == ".sh": + cmd = ["bash", str(script)] + elif ext == ".py": + cmd = [sys.executable, str(script)] + elif ext == ".js": + cmd = ["node", str(script)] + else: + # Try to execute directly (assumes shebang) + cmd = [str(script)] + + # Build CLI arguments from kwargs + for key, value in kwargs.items(): + # Skip keys that start with underscore (internal parameters) + if key.startswith("_"): + continue + + arg_key = f"--{key.replace('_', '-')}" + if isinstance(value, bool): + if value: + cmd.append(arg_key) + elif value is not None and value != "": + # JSON-encode complex values, use str for simple ones + # Skip empty strings to avoid --key= which breaks argument parsers + if isinstance(value, (dict, list)): + cmd.append(f"{arg_key}={json.dumps(value)}") + else: + # Ensure value is converted to string and strip whitespace + str_value = str(value).strip() + if str_value: # Only add if non-empty after stripping + cmd.append(f"{arg_key}={str_value}") + + # Set up environment with base paths + env = os.environ.copy() + env["DATA_DIR"] = str(getattr(settings, "DATA_DIR", Path.cwd())) + env["ARCHIVE_DIR"] = str(getattr(settings, "ARCHIVE_DIR", Path.cwd() / "archive")) + env["ABX_RUNTIME"] = "archivebox" + env.setdefault("MACHINE_ID", getattr(settings, "MACHINE_ID", "") or os.environ.get("MACHINE_ID", "")) + + resolved_output_dir = output_dir.resolve() + output_parts = set(resolved_output_dir.parts) + if "snapshots" in output_parts: + env["SNAP_DIR"] = str(resolved_output_dir.parent) + if "crawls" in output_parts: + env["CRAWL_DIR"] = str(resolved_output_dir.parent) + + crawl_id = kwargs.get("_crawl_id") or kwargs.get("crawl_id") + if crawl_id: + try: + from archivebox.crawls.models import Crawl + + crawl = Crawl.objects.filter(id=crawl_id).first() + if crawl: + env["CRAWL_DIR"] = str(crawl.output_dir) + except Exception: + pass + + # Get LIB_DIR and LIB_BIN_DIR from config + lib_dir = 
config.get("LIB_DIR", getattr(settings, "LIB_DIR", None)) + lib_bin_dir = config.get("LIB_BIN_DIR", getattr(settings, "LIB_BIN_DIR", None)) + if lib_dir: + env["LIB_DIR"] = str(lib_dir) + if not lib_bin_dir and lib_dir: + # Derive LIB_BIN_DIR from LIB_DIR if not set + lib_bin_dir = Path(lib_dir) / "bin" + + # Set NODE_PATH for Node.js module resolution. + # Priority: config dict > derive from LIB_DIR + node_path = config.get("NODE_PATH") + if not node_path and lib_dir: + # Derive from LIB_DIR/npm/node_modules (create if needed) + node_modules_dir = Path(lib_dir) / "npm" / "node_modules" + node_modules_dir.mkdir(parents=True, exist_ok=True) + node_path = str(node_modules_dir) + if node_path: + env["NODE_PATH"] = node_path + env["NODE_MODULES_DIR"] = node_path # For backwards compatibility + + # Export all config values to environment (already merged by get_config()) + # Skip keys we've already handled specially above (PATH, LIB_DIR, LIB_BIN_DIR, NODE_PATH, etc.) + SKIP_KEYS = { + "PATH", + "LIB_DIR", + "LIB_BIN_DIR", + "NODE_PATH", + "NODE_MODULES_DIR", + "DATA_DIR", + "ARCHIVE_DIR", + "MACHINE_ID", + "SNAP_DIR", + "CRAWL_DIR", + } + for key, value in config.items(): + if key in SKIP_KEYS: + continue # Already handled specially above, don't overwrite + if value is None: + continue + elif isinstance(value, bool): + env[key] = "true" if value else "false" + elif isinstance(value, (list, dict)): + env[key] = json.dumps(value) + else: + env[key] = str(value) + + # Build PATH with proper precedence: + # 1. path-like *_BINARY parents (explicit binary overrides / cached abspaths) + # 2. LIB_BIN_DIR (local symlinked binaries) + # 3. 
existing PATH + runtime_bin_dirs: list[str] = [] + if lib_bin_dir: + lib_bin_dir = str(lib_bin_dir) + env["LIB_BIN_DIR"] = lib_bin_dir + for key, raw_value in env.items(): + if not key.endswith("_BINARY"): + continue + value = str(raw_value or "").strip() + if not value: + continue + path_value = Path(value).expanduser() + if not (path_value.is_absolute() or "/" in value or "\\" in value): + continue + binary_dir = str(path_value.resolve(strict=False).parent) + if binary_dir and binary_dir not in runtime_bin_dirs: + runtime_bin_dirs.append(binary_dir) + if lib_bin_dir and lib_bin_dir not in runtime_bin_dirs: + runtime_bin_dirs.append(lib_bin_dir) + uv_value = str(env.get("UV") or "").strip() + if uv_value: + uv_bin_dir = str(Path(uv_value).expanduser().resolve(strict=False).parent) + if uv_bin_dir and uv_bin_dir not in runtime_bin_dirs: + runtime_bin_dirs.append(uv_bin_dir) + + current_path = env.get("PATH", "") + path_parts = [part for part in current_path.split(os.pathsep) if part] + for extra_dir in reversed(runtime_bin_dirs): + if extra_dir not in path_parts: + path_parts.insert(0, extra_dir) + env["PATH"] = os.pathsep.join(path_parts) + + # Create output directory if needed + output_dir.mkdir(parents=True, exist_ok=True) + + # Detect if this is a background hook (long-running daemon) + # Background hooks use the .daemon.bg. or .finite.bg. filename convention. + # Old convention: __background in stem (for backwards compatibility) + is_background = ".bg." 
def extract_records_from_process(process: "Process") -> list[dict[str, Any]]:
    """
    Return the parsed records of a hook Process, tagged with plugin metadata.

    The records come from process.get_records(). Each record dict is updated
    in place with setdefault(), so keys a record already carries are left
    untouched:
        plugin       basename of process.pwd ('unknown' if pwd is empty)
        hook_name    basename of process.cmd[1] ('unknown' if cmd has < 2 items)
        plugin_hook  process.cmd[1] ('' if cmd has < 2 items)

    Args:
        process: Process model instance with stdout captured

    Returns:
        The list of records (the same dict objects returned by get_records()),
        or [] when there are none.
    """
    parsed = process.get_records()
    if not parsed:
        return []

    plugin_label = Path(process.pwd).name if process.pwd else "unknown"
    command = process.cmd
    # NOTE(review): assumes cmd[1] is the hook script path, i.e. the command is
    # interpreter-prefixed ([interpreter, script, ...]) - confirm for hooks
    # that are executed directly.
    script_arg = command[1] if len(command) > 1 else None

    metadata = {
        "plugin": plugin_label,
        "hook_name": Path(script_arg).name if script_arg is not None else "unknown",
        "plugin_hook": script_arg if script_arg is not None else "",
    }

    for item in parsed:
        for field, fallback in metadata.items():
            item.setdefault(field, fallback)

    return parsed
def get_plugin_name(plugin: str) -> str:
    """
    Strip a leading numeric ordering prefix from a plugin name.

    Only the text before the first underscore is inspected; when it consists
    solely of digits, everything after that underscore is returned. Any other
    name is returned unchanged.

    Examples:
        '10_title' -> 'title'
        '26_readability' -> 'readability'
        '50_parse_html_urls' -> 'parse_html_urls'
        'wget' -> 'wget'
    """
    prefix, separator, remainder = plugin.partition("_")
    has_numeric_prefix = bool(separator) and prefix.isdigit()
    return remainder if has_numeric_prefix else plugin
def discover_plugins_that_provide_interface(
    module_name: str,
    required_attrs: list[str],
    plugin_prefix: str | None = None,
) -> dict[str, Any]:
    """
    Import `<module_name>.py` from every plugin directory that offers it.

    Walks the immediate subdirectories of the built-in and user plugin roots.
    A plugin contributes a backend when its directory name starts with
    `plugin_prefix` (if one is given), it contains `<module_name>.py`, the
    file imports without raising, and the imported module has every attribute
    listed in `required_attrs`. Plugins that fail any step are skipped silently.

    Args:
        module_name: Name of the module to look for (e.g., 'search')
        required_attrs: Attributes the module must have (e.g., ['search', 'flush'])
        plugin_prefix: Optional directory-name prefix (e.g., 'search_backend_')

    Returns:
        Dict mapping backend names to imported modules. The backend name is the
        plugin directory name with the prefix removed,
        e.g. search_backend_sqlite -> 'sqlite'. Because the user root is
        scanned after the built-in root, a user plugin with the same backend
        name replaces the built-in entry.

    Example:
        backends = discover_plugins_that_provide_interface(
            module_name='search',
            required_attrs=['search', 'flush'],
            plugin_prefix='search_backend_',
        )
    """
    import importlib.util

    discovered: dict[str, Any] = {}
    module_filename = f"{module_name}.py"

    for root in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
        if not root.exists():
            continue

        for candidate in root.iterdir():
            if not candidate.is_dir():
                continue

            dir_name = candidate.name
            if plugin_prefix and not dir_name.startswith(plugin_prefix):
                continue

            source_file = candidate / module_filename
            if not source_file.exists():
                continue

            try:
                spec = importlib.util.spec_from_file_location(
                    f"archivebox.dynamic_plugins.{dir_name}.{module_name}",
                    source_file,
                )
                if spec is None or spec.loader is None:
                    continue

                loaded = importlib.util.module_from_spec(spec)
                # Executes the plugin module's top-level code
                spec.loader.exec_module(loaded)

                if any(not hasattr(loaded, attr) for attr in required_attrs):
                    continue

                backend_key = dir_name[len(plugin_prefix) :] if plugin_prefix else dir_name
                discovered[backend_key] = loaded
            except Exception:
                # Skip plugins that fail to import
                continue

    return discovered
def discover_plugin_configs() -> dict[str, dict[str, Any]]:
    """
    Discover and load every plugin's config.json schema.

    Each plugin directory returned by iter_plugin_dirs() may contain a
    config.json file holding a JSONSchema for its configuration options.
    The config.json files use JSONSchema draft-07 with custom extensions:
        - x-fallback: Global config key to use as fallback
        - x-aliases: List of old/alternative config key names

    A file is accepted only if it parses to a JSON object whose "type" is
    "object" and which has a "properties" key. Files that cannot be read,
    are not valid UTF-8, or are not valid JSON produce a warning on stderr
    and are skipped, so one broken plugin cannot break discovery.

    Returns:
        Dict mapping plugin directory names to their parsed schemas,
        e.g., {'wget': {...schema...}, 'chrome': {...schema...}}

    Example config.json:
        {
            "$schema": "http://json-schema.org/draft-07/schema#",
            "type": "object",
            "properties": {
                "SAVE_WGET": {"type": "boolean", "default": true},
                "WGET_TIMEOUT": {"type": "integer", "default": 60, "x-fallback": "TIMEOUT"}
            }
        }
    """
    import sys

    configs: dict[str, dict[str, Any]] = {}

    for plugin_dir in iter_plugin_dirs():
        config_path = plugin_dir / "config.json"
        if not config_path.exists():
            continue

        try:
            # Decode explicitly as UTF-8 rather than the locale encoding so the
            # same file loads identically on every platform.
            schema = json.loads(config_path.read_text(encoding="utf-8"))
        except (ValueError, OSError) as e:
            # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
            # Warn but continue - a malformed config shouldn't break discovery.
            print(f"Warning: Failed to load config.json from {plugin_dir.name}: {e}", file=sys.stderr)
            continue

        # Basic validation: must be an object schema with properties
        if not isinstance(schema, dict):
            continue
        if schema.get("type") != "object":
            continue
        if "properties" not in schema:
            continue

        configs[plugin_dir.name] = schema

    return configs
+ e.g., {'SAVE_WGET': True, 'WGET_TIMEOUT': 60, ...} + """ + plugin_configs = discover_plugin_configs() + defaults = {} + + for plugin_name, schema in plugin_configs.items(): + properties = schema.get("properties", {}) + for key, prop_schema in properties.items(): + if "default" in prop_schema: + defaults[key] = prop_schema["default"] + + return defaults + + +def get_plugin_special_config(plugin_name: str, config: dict[str, Any], _visited: set[str] | None = None) -> dict[str, Any]: + """ + Extract special config keys for a plugin following naming conventions. + + ArchiveBox recognizes 3 special config key patterns per plugin: + - {PLUGIN}_ENABLED: Enable/disable toggle (default True) + - {PLUGIN}_TIMEOUT: Plugin-specific timeout (fallback to TIMEOUT, default 300) + - {PLUGIN}_BINARY: Primary binary path (default to plugin_name) + + These allow ArchiveBox to: + - Skip disabled plugins (optimization) + - Enforce plugin-specific timeouts automatically + - Discover plugin binaries for validation + + Args: + plugin_name: Plugin name (e.g., 'wget', 'media', 'chrome') + config: Merged config dict from get_config() (properly merges file, env, machine, crawl, snapshot) + + Returns: + Dict with standardized keys: + { + 'enabled': True, # bool + 'timeout': 60, # int, seconds + 'binary': 'wget', # str, path or name + } + + Examples: + >>> from archivebox.config.configset import get_config + >>> config = get_config(crawl=my_crawl, snapshot=my_snapshot) + >>> get_plugin_special_config('wget', config) + {'enabled': True, 'timeout': 120, 'binary': '/usr/bin/wget'} + """ + plugin_upper = plugin_name.upper() + + # 1. 
Enabled: Check PLUGINS whitelist first, then PLUGINNAME_ENABLED (default True) + # Old names (USE_*, SAVE_*) are aliased in config.json via x-aliases + + # Check if PLUGINS whitelist is specified (e.g., --plugins=wget,favicon) + plugins_whitelist = config.get("PLUGINS", "") + if plugins_whitelist: + # PLUGINS whitelist is specified - include transitive required_plugins from + # config.json so selecting a plugin also enables its declared plugin-level + # dependencies (e.g. singlefile -> chrome). + plugin_configs = discover_plugin_configs() + plugin_names = {p.strip().lower() for p in plugins_whitelist.split(",") if p.strip()} + pending = list(plugin_names) + + while pending: + current = pending.pop() + schema = plugin_configs.get(current, {}) + required_plugins = schema.get("required_plugins", []) + if not isinstance(required_plugins, list): + continue + + for required_plugin in required_plugins: + required_plugin_name = str(required_plugin).strip().lower() + if not required_plugin_name or required_plugin_name in plugin_names: + continue + plugin_names.add(required_plugin_name) + pending.append(required_plugin_name) + + if plugin_name.lower() not in plugin_names: + # Plugin not in whitelist - explicitly disabled + enabled = False + else: + # Plugin is in whitelist - check if explicitly disabled by PLUGINNAME_ENABLED + enabled_key = f"{plugin_upper}_ENABLED" + enabled = config.get(enabled_key) + if enabled is None: + enabled = True # Default to enabled if in whitelist + elif isinstance(enabled, str): + enabled = enabled.lower() not in ("false", "0", "no", "") + else: + # No PLUGINS whitelist - use PLUGINNAME_ENABLED (default True) + enabled_key = f"{plugin_upper}_ENABLED" + enabled = config.get(enabled_key) + if enabled is None: + enabled = True + elif isinstance(enabled, str): + # Handle string values from config file ("true"/"false") + enabled = enabled.lower() not in ("false", "0", "no", "") + + plugin_configs = discover_plugin_configs() + plugin_name_lower = 
plugin_name.lower() + + if enabled: + visited = _visited or set() + if plugin_name_lower not in visited: + next_visited = visited | {plugin_name_lower} + schema = plugin_configs.get(plugin_name_lower, {}) + required_plugins = schema.get("required_plugins", []) + if isinstance(required_plugins, list): + for required_plugin in required_plugins: + required_plugin_name = str(required_plugin).strip() + if not required_plugin_name: + continue + required_config = get_plugin_special_config(required_plugin_name, config, _visited=next_visited) + if not required_config["enabled"]: + enabled = False + break + + # 2. Timeout: PLUGINNAME_TIMEOUT (fallback to TIMEOUT, default 300) + timeout_key = f"{plugin_upper}_TIMEOUT" + timeout = config.get(timeout_key) or config.get("TIMEOUT", 300) + + # 3. Binary: PLUGINNAME_BINARY (default to plugin_name) + binary_key = f"{plugin_upper}_BINARY" + binary = config.get(binary_key, plugin_name) + + return { + "enabled": bool(enabled), + "timeout": int(timeout), + "binary": str(binary), + } + + +# ============================================================================= +# Plugin Template Discovery +# ============================================================================= +# +# Plugins can provide custom templates for rendering their output in the UI. 
+# Templates are discovered by filename convention inside each plugin's templates/ dir: +# +# abx_plugins/plugins/<plugin_name>/ +# templates/ +# icon.html # Icon for admin table view (small inline HTML) +# card.html # Preview card for snapshot header +# full.html # Fullscreen view template +# +# Template context variables available: +# {{ result }} - ArchiveResult object +# {{ snapshot }} - Parent Snapshot object +# {{ output_path }} - Path to output file/dir relative to snapshot dir +# {{ plugin }} - Plugin name (e.g., 'screenshot', 'singlefile') +# + +# Default templates used when plugin doesn't provide one +DEFAULT_TEMPLATES = { + "icon": """ + <span title="{{ plugin }}" style="display:inline-flex; width:20px; height:20px; align-items:center; justify-content:center;"> + {{ icon }} + </span> + """, + "card": """ + <iframe src="{{ output_path }}" + class="card-img-top" + style="width: 100%; height: 100%; border: none;" + sandbox="allow-same-origin allow-scripts allow-forms" + loading="lazy"> + </iframe> + """, + "full": """ + <iframe src="{{ output_path }}" + class="full-page-iframe" + style="width: 100%; height: 100vh; border: none;" + sandbox="allow-same-origin allow-scripts allow-forms"> + </iframe> + """, +} + + +def get_plugin_template(plugin: str, template_name: str, fallback: bool = True) -> str | None: + """ + Get a plugin template by plugin name and template type. + + Args: + plugin: Plugin name (e.g., 'screenshot', '15_singlefile') + template_name: One of 'icon', 'card', 'full' + fallback: If True, return default template if plugin template not found + + Returns: + Template content as string, or None if not found and fallback=False. 
+ """ + base_name = get_plugin_name(plugin) + if base_name in ("yt-dlp", "youtube-dl"): + base_name = "ytdlp" + + for plugin_dir in iter_plugin_dirs(): + # Match by directory name (exact or partial) + if plugin_dir.name == base_name or plugin_dir.name.endswith(f"_{base_name}"): + template_path = plugin_dir / "templates" / f"{template_name}.html" + if template_path.exists(): + return template_path.read_text() + + # Fall back to default template if requested + if fallback: + return DEFAULT_TEMPLATES.get(template_name, "") + + return None + + +@lru_cache(maxsize=None) +def get_plugin_icon(plugin: str) -> str: + """ + Get the icon for a plugin from its icon.html template. + + Args: + plugin: Plugin name (e.g., 'screenshot', '15_singlefile') + + Returns: + Icon HTML/emoji string. + """ + # Try plugin-provided icon template + icon_template = get_plugin_template(plugin, "icon", fallback=False) + if icon_template: + return mark_safe(icon_template.strip()) + + # Fall back to generic folder icon + return mark_safe("📁") + + +# ============================================================================= +# Hook Result Processing Helpers +# ============================================================================= + + +def process_hook_records(records: list[dict[str, Any]], overrides: dict[str, Any] | None = None) -> dict[str, int]: + """ + Process JSONL records emitted by hook stdout. + + This handles hook-emitted record types such as Snapshot, Tag, BinaryRequest, + and Binary. It does not process internal bus lifecycle events, since those + are not emitted as JSONL records by hook subprocesses. + + Args: + records: List of JSONL record dicts from result['records'] + overrides: Dict with 'snapshot', 'crawl', 'dependency', 'created_by_id', etc. 
+ + Returns: + Dict with counts by record type + """ + stats = {} + overrides = overrides or {} + + for record in records: + record_type = record.get("type") + if not record_type: + continue + + # Skip ArchiveResult records (they update the calling ArchiveResult, not create new ones) + if record_type == "ArchiveResult": + continue + + try: + # Dispatch to appropriate model's from_json() method + if record_type == "Snapshot": + from archivebox.core.models import Snapshot + + if record.get("url"): + record = { + **record, + "url": sanitize_extracted_url(fix_url_from_markdown(str(record["url"]).strip())), + } + if not record["url"]: + continue + + # Check if discovered snapshot exceeds crawl max_depth + snapshot_depth = record.get("depth", 0) + crawl = overrides.get("crawl") + if crawl and snapshot_depth > crawl.max_depth: + # Skip - this URL was discovered but exceeds max crawl depth + continue + + obj = Snapshot.from_json(record.copy(), overrides) + if obj: + stats["Snapshot"] = stats.get("Snapshot", 0) + 1 + + elif record_type == "Tag": + from archivebox.core.models import Tag + + obj = Tag.from_json(record.copy(), overrides) + if obj: + stats["Tag"] = stats.get("Tag", 0) + 1 + + elif record_type in {"BinaryRequest", "Binary"}: + from archivebox.machine.models import Binary + + obj = Binary.from_json(record.copy(), overrides) + if obj: + stats[record_type] = stats.get(record_type, 0) + 1 + + else: + import sys + + print(f"Warning: Unknown record type '{record_type}' from hook output", file=sys.stderr) + + except Exception as e: + import sys + + print(f"Warning: Failed to create {record_type}: {e}", file=sys.stderr) + continue + + return stats diff --git a/archivebox/ideas/__init__.py b/archivebox/ideas/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/archivebox/ideas/process_plugin.py b/archivebox/ideas/process_plugin.py new file mode 100644 index 0000000000..aad584bbaf --- /dev/null +++ b/archivebox/ideas/process_plugin.py @@ -0,0 +1,325 
__package__ = "archivebox.ideas"

import asyncio
import importlib
import json
import os
import shlex
import signal
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Optional
from collections.abc import Callable, Mapping, MutableMapping

from pydantic import BaseModel, Field

try:
    bubus = importlib.import_module("bubus")
    BaseEvent = bubus.BaseEvent
    EventBus = bubus.EventBus
except Exception as exc:  # pragma: no cover - optional dependency
    raise ImportError("ProcessPlugin requires bubus to be installed") from exc

try:
    uuid7str = importlib.import_module("bubus.service").uuid7str
except Exception:  # pragma: no cover - optional dependency
    from uuid import uuid4 as _uuid4

    def uuid7str() -> str:
        """Fallback id generator (random uuid4) when bubus.service is unavailable."""
        return str(_uuid4())


def _utcnow() -> datetime:
    """Return the current time as a timezone-aware UTC datetime."""
    return datetime.now(timezone.utc)


class ProcessRecord(BaseModel):
    """Snapshot of a launched process: command, environment, pid, log paths, exit state."""

    id: str = Field(default_factory=uuid7str)
    cmd: list[str]
    cwd: str | None = None
    env: dict[str, str] = Field(default_factory=dict)
    pid: int | None = None
    started_at: datetime | None = None
    ended_at: datetime | None = None  # set when the process is finalized
    exit_code: int | None = None  # set when the process is finalized
    stdout_path: str | None = None
    stderr_path: str | None = None
    cmd_path: str | None = None
    pid_path: str | None = None
    is_background: bool = False
    parent_process_id: str | None = None


class ProcessLaunch(BaseEvent[ProcessRecord]):
    """Request to spawn a process; handled by ProcessPlugin.on_ProcessLaunch."""

    cmd: list[str]
    cwd: str | None = None
    env: dict[str, str] | None = None  # overrides merged over the plugin's base env
    timeout: float | None = None  # seconds before the process is terminated
    output_dir: str | None = None  # where log/cmd/pid files are written
    log_prefix: str | None = None  # filename stem for those files (defaults to the process id)
    is_background: bool = False  # if True the handler returns without waiting for exit
    parent_process_id: str | None = None
    parse_stdout_events: bool = True  # parse JSON object lines on stdout into events


class ProcessStarted(BaseEvent[None]):
    """Dispatched once the subprocess has been spawned."""

    process: ProcessRecord


class ProcessExited(BaseEvent[None]):
    """Dispatched once per process after it exited and its output was drained."""

    process: ProcessRecord


class ProcessKill(BaseEvent[ProcessRecord]):
    """Request to signal a running process; handled by ProcessPlugin.on_ProcessKill."""

    process_id: str
    signal: int = signal.SIGTERM
    timeout: float | None = 10.0  # seconds to wait before escalating to SIGKILL


@dataclass
class _RunningProcess:
    """Book-keeping for one live subprocess tracked by ProcessPlugin."""

    process: asyncio.subprocess.Process
    record: ProcessRecord
    stdout_task: asyncio.Task[None] | None
    stderr_task: asyncio.Task[None] | None
    watcher_task: asyncio.Task[None] | None
    parent_event_id: str | None
    # Serializes finalization so concurrent finalizers (background watcher and
    # a ProcessKill handler) report the exit exactly once.
    finalize_lock: asyncio.Lock = field(default_factory=asyncio.Lock)
    finalized: bool = False


JsonEventAdapter = Callable[[dict[str, Any], str | None], Optional[BaseEvent[Any]]]


class ProcessPlugin:
    """Spawn and monitor processes using events (no Django required)."""

    # Seconds to wait after SIGTERM before escalating to SIGKILL on timeout.
    TERM_GRACE_SECONDS = 2.0

    def __init__(
        self,
        bus: EventBus,
        *,
        env: Mapping[str, str] | None = None,
        json_event_adapter: JsonEventAdapter | None = None,
    ) -> None:
        """
        Args:
            bus: Event bus used to register handlers and dispatch events.
            env: Base environment for spawned processes (defaults to os.environ).
            json_event_adapter: Optional callable turning a parsed stdout JSON
                object (plus the parent event id) into an event, or None to drop it.
        """
        self.bus = bus
        self.env = dict(env or os.environ)
        self.json_event_adapter = json_event_adapter
        self._running: MutableMapping[str, _RunningProcess] = {}

    def register_event_handlers(self) -> None:
        """Subscribe the launch and kill handlers on the bus."""
        self.bus.on(ProcessLaunch, self.on_ProcessLaunch)
        self.bus.on(ProcessKill, self.on_ProcessKill)

    async def on_ProcessLaunch(self, event: ProcessLaunch) -> ProcessRecord:
        """Spawn the requested command and start capturing its output.

        Writes ``<prefix>.sh`` and ``<prefix>.pid`` into the output directory and
        streams stdout/stderr into ``<prefix>.stdout.log`` / ``<prefix>.stderr.log``.
        Foreground launches wait for exit (or timeout); background launches
        return immediately and are watched by a task.

        Returns:
            The ProcessRecord; for foreground launches it includes exit_code/ended_at.
        """
        parent_event_id = event.event_id
        proc_id = uuid7str()
        cwd = event.cwd or event.output_dir or os.getcwd()
        output_dir = Path(event.output_dir or cwd)
        output_dir.mkdir(parents=True, exist_ok=True)

        env = {**self.env, **(event.env or {})}

        log_prefix = event.log_prefix or proc_id
        stdout_path = output_dir / f"{log_prefix}.stdout.log"
        stderr_path = output_dir / f"{log_prefix}.stderr.log"
        cmd_path = output_dir / f"{log_prefix}.sh"
        pid_path = output_dir / f"{log_prefix}.pid"

        self._write_cmd_file(cmd_path, event.cmd)

        proc = await asyncio.create_subprocess_exec(
            *event.cmd,
            cwd=str(cwd),
            env=env,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
            # Own session => own process group, so killpg() reaches children too.
            start_new_session=True,
        )

        self._write_pid_file(pid_path, proc.pid)

        record = ProcessRecord(
            id=proc_id,
            cmd=event.cmd,
            cwd=str(cwd),
            env=env,
            pid=proc.pid,
            started_at=_utcnow(),
            stdout_path=str(stdout_path),
            stderr_path=str(stderr_path),
            cmd_path=str(cmd_path),
            pid_path=str(pid_path),
            is_background=event.is_background,
            parent_process_id=event.parent_process_id,
        )

        await event.event_bus.dispatch(
            ProcessStarted(process=record, event_parent_id=parent_event_id),
        )

        stdout_task = asyncio.create_task(
            self._consume_stream(
                proc.stdout,
                stdout_path,
                parent_event_id,
                event.parse_stdout_events,
            ),
        )
        stderr_task = asyncio.create_task(
            self._consume_stream(proc.stderr, stderr_path, parent_event_id, False),
        )

        running = _RunningProcess(
            process=proc,
            record=record,
            stdout_task=stdout_task,
            stderr_task=stderr_task,
            watcher_task=None,
            parent_event_id=parent_event_id,
        )
        self._running[proc_id] = running

        if event.is_background:
            running.watcher_task = asyncio.create_task(
                self._watch_process(proc_id, event.timeout),
            )
            return record

        await self._watch_process(proc_id, event.timeout)
        return running.record

    async def on_ProcessKill(self, event: ProcessKill) -> ProcessRecord:
        """Signal a tracked process, escalate to SIGKILL on timeout, and finalize it.

        Raises:
            RuntimeError: if no running process has the given id.
        """
        running = self._running.get(event.process_id)
        if not running:
            raise RuntimeError(f"Process not found: {event.process_id}")

        proc = running.process
        self._terminate_process(proc, event.signal)

        if event.timeout is not None:
            try:
                await asyncio.wait_for(proc.wait(), timeout=event.timeout)
            except asyncio.TimeoutError:
                # asyncio.TimeoutError is only an alias of the builtin
                # TimeoutError on Python 3.11+; catching the asyncio name works
                # on every version.
                self._terminate_process(proc, signal.SIGKILL)
                # Reap the process so returncode is populated before finalizing.
                await proc.wait()
        else:
            await proc.wait()

        await self._finalize_process(event.process_id)
        return running.record

    async def _watch_process(self, process_id: str, timeout: float | None) -> None:
        """Wait for a process to exit, enforcing ``timeout``, then finalize it."""
        running = self._running.get(process_id)
        if not running:
            return
        proc = running.process
        try:
            if timeout is not None:
                await asyncio.wait_for(proc.wait(), timeout=timeout)
            else:
                await proc.wait()
        except asyncio.TimeoutError:
            self._terminate_process(proc, signal.SIGTERM)
            try:
                # Give the process a short grace period, but return as soon as
                # it exits instead of always sleeping the full period.
                await asyncio.wait_for(proc.wait(), timeout=self.TERM_GRACE_SECONDS)
            except asyncio.TimeoutError:
                self._terminate_process(proc, signal.SIGKILL)
                await proc.wait()
        await self._finalize_process(process_id)

    async def _finalize_process(self, process_id: str) -> None:
        """Record exit state, dispatch ProcessExited once, and stop tracking the process."""
        running = self._running.get(process_id)
        if not running:
            return

        async with running.finalize_lock:
            if running.finalized:
                # Another finalizer already reported this exit.
                return
            running.finalized = True

            proc = running.process
            record = running.record
            try:
                stream_tasks = [task for task in (running.stdout_task, running.stderr_task) if task]
                if stream_tasks:
                    # return_exceptions: a failed log consumer must not prevent
                    # the exit from being recorded and announced.
                    await asyncio.gather(*stream_tasks, return_exceptions=True)

                record.exit_code = proc.returncode
                record.ended_at = _utcnow()

                await self.bus.dispatch(
                    ProcessExited(process=record, event_parent_id=running.parent_event_id),
                )
            finally:
                self._running.pop(process_id, None)

    async def _consume_stream(
        self,
        stream: asyncio.StreamReader | None,
        path: Path,
        parent_event_id: str | None,
        parse_events: bool,
    ) -> None:
        """Copy a subprocess stream line-by-line into ``path``, optionally parsing events."""
        if stream is None:
            return
        with path.open("w", encoding="utf-8") as fh:
            while True:
                try:
                    line = await stream.readline()
                except ValueError:
                    # readline() raises ValueError when a line exceeds the
                    # reader's buffer limit. Keep draining: if the consumer
                    # died here the child could block forever on a full pipe.
                    continue
                if not line:
                    break
                text = line.decode("utf-8", errors="replace")
                fh.write(text)
                fh.flush()
                if parse_events:
                    await self._maybe_dispatch_json_event(text, parent_event_id)

    async def _maybe_dispatch_json_event(self, line: str, parent_event_id: str | None) -> None:
        """Dispatch an event for ``line`` if it is a JSON object describing one."""
        text = line.strip()
        if not text.startswith("{") or not text.endswith("}"):
            return
        try:
            data = json.loads(text)
        except json.JSONDecodeError:
            return

        event = None
        if self.json_event_adapter:
            event = self.json_event_adapter(data, parent_event_id)
        elif isinstance(data, dict) and "event_type" in data:
            try:
                event = BaseEvent.model_validate(data)
            except Exception:
                # Not a valid event payload: treat it as ordinary output.
                event = None

        if event is None:
            return

        if not getattr(event, "event_parent_id", None) and parent_event_id:
            event.event_parent_id = parent_event_id
        await self.bus.dispatch(event)

    @staticmethod
    def _write_cmd_file(path: Path, cmd: list[str]) -> None:
        """Write the shell-quoted command line to ``path``."""
        cmd_line = shlex.join(cmd)
        path.write_text(cmd_line + "\n", encoding="utf-8")

    @staticmethod
    def _write_pid_file(path: Path, pid: int) -> None:
        """Write ``pid`` to ``path`` and stamp the file's atime/mtime with the current time."""
        path.write_text(str(pid), encoding="utf-8")
        ts = datetime.now().timestamp()
        os.utime(path, (ts, ts))

    @staticmethod
    def _terminate_process(proc: asyncio.subprocess.Process, sig: int) -> None:
        """Best-effort: send ``sig`` to the process group, falling back to the process."""
        if proc.returncode is not None:
            return
        try:
            os.killpg(proc.pid, sig)
        except (OSError, AttributeError):
            # Group signalling failed (process gone, not permitted, or killpg
            # missing on this platform): try the single process instead.
            try:
                os.kill(proc.pid, sig)
            except OSError:
                # Already exited or not ours to signal; nothing left to do.
                pass


__all__ = [
    "ProcessRecord",
    "ProcessLaunch",
    "ProcessStarted",
    "ProcessExited",
    "ProcessKill",
    "ProcessPlugin",
]
- """ - assert a.base_url == b.base_url, f'Cannot merge two links with different URLs ({a.base_url} != {b.base_url})' - - # longest url wins (because a fuzzy url will always be shorter) - url = a.url if len(a.url) > len(b.url) else b.url - - # best title based on length and quality - possible_titles = [ - title - for title in (a.title, b.title) - if title and title.strip() and '://' not in title - ] - title = None - if len(possible_titles) == 2: - title = max(possible_titles, key=lambda t: len(t)) - elif len(possible_titles) == 1: - title = possible_titles[0] - - # earliest valid timestamp - timestamp = ( - a.timestamp - if float(a.timestamp or 0) < float(b.timestamp or 0) else - b.timestamp - ) - - # all unique, truthy tags - tags_set = ( - set(tag.strip() for tag in (a.tags or '').split(',')) - | set(tag.strip() for tag in (b.tags or '').split(',')) - ) - tags = ','.join(tags_set) or None - - # all unique source entries - sources = list(set(a.sources + b.sources)) - - # all unique history entries for the combined archive methods - all_methods = set(list(a.history.keys()) + list(a.history.keys())) - history = { - method: (a.history.get(method) or []) + (b.history.get(method) or []) - for method in all_methods - } - for method in all_methods: - deduped_jsons = { - pyjson.dumps(result, sort_keys=True, cls=ExtendedEncoder) - for result in history[method] - } - history[method] = list(reversed(sorted( - (ArchiveResult.from_json(pyjson.loads(result)) for result in deduped_jsons), - key=lambda result: result.start_ts, - ))) - - return Link( - url=url, - timestamp=timestamp, - title=title, - tags=tags, - sources=sources, - history=history, - ) - - -@enforce_types -def validate_links(links: Iterable[Link]) -> List[Link]: - timer = TimedProgress(TIMEOUT * 4) - try: - links = archivable_links(links) # remove chrome://, about:, mailto: etc. 
- links = sorted_links(links) # deterministically sort the links based on timestamp, url - links = fix_duplicate_links(links) # merge/dedupe duplicate timestamps & urls - finally: - timer.end() - - return list(links) - -@enforce_types -def archivable_links(links: Iterable[Link]) -> Iterable[Link]: - """remove chrome://, about:// or other schemed links that cant be archived""" - for link in links: - try: - urlparse(link.url) - except ValueError: - continue - if scheme(link.url) not in ('http', 'https', 'ftp'): - continue - if URL_BLACKLIST_PTN and URL_BLACKLIST_PTN.search(link.url): - continue - - yield link - - -@enforce_types -def fix_duplicate_links(sorted_links: Iterable[Link]) -> Iterable[Link]: - """ - ensures that all non-duplicate links have monotonically increasing timestamps - """ - # from core.models import Snapshot - - unique_urls: OrderedDict[str, Link] = OrderedDict() - - for link in sorted_links: - if link.url in unique_urls: - # merge with any other links that share the same url - link = merge_links(unique_urls[link.url], link) - unique_urls[link.url] = link - - return unique_urls.values() - - -@enforce_types -def sorted_links(links: Iterable[Link]) -> Iterable[Link]: - sort_func = lambda link: (link.timestamp.split('.', 1)[0], link.url) - return sorted(links, key=sort_func, reverse=True) - - -@enforce_types -def links_after_timestamp(links: Iterable[Link], resume: Optional[float]=None) -> Iterable[Link]: - if not resume: - yield from links - return - - for link in links: - try: - if float(link.timestamp) <= resume: - yield link - except (ValueError, TypeError): - print('Resume value and all timestamp values must be valid numbers.') - - -@enforce_types -def lowest_uniq_timestamp(used_timestamps: OrderedDict, timestamp: str) -> str: - """resolve duplicate timestamps by appending a decimal 1234, 1234 -> 1234.1, 1234.2""" - - timestamp = timestamp.split('.')[0] - nonce = 0 - - # first try 152323423 before 152323423.0 - if timestamp not in 
used_timestamps: - return timestamp - - new_timestamp = '{}.{}'.format(timestamp, nonce) - while new_timestamp in used_timestamps: - nonce += 1 - new_timestamp = '{}.{}'.format(timestamp, nonce) - - return new_timestamp - - - -### Main Links Index - -@contextmanager -@enforce_types -def timed_index_update(out_path: Path): - log_indexing_started(out_path) - timer = TimedProgress(TIMEOUT * 2, prefix=' ') - try: - yield - finally: - timer.end() - - assert out_path.exists(), f'Failed to write index file: {out_path}' - log_indexing_finished(out_path) - - -@enforce_types -def write_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None: - """Writes links to sqlite3 file for a given list of links""" - - log_indexing_process_started(len(links)) - - try: - with timed_index_update(out_dir / SQL_INDEX_FILENAME): - write_sql_main_index(links, out_dir=out_dir) - os.chmod(out_dir / SQL_INDEX_FILENAME, int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes - - except (KeyboardInterrupt, SystemExit): - stderr('[!] 
Warning: Still writing index to disk...', color='lightyellow') - stderr(' Run archivebox init to fix any inconsistencies from an ungraceful exit.') - with timed_index_update(out_dir / SQL_INDEX_FILENAME): - write_sql_main_index(links, out_dir=out_dir) - os.chmod(out_dir / SQL_INDEX_FILENAME, int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes - raise SystemExit(0) - - log_indexing_process_finished() - -@enforce_types -def load_main_index(out_dir: Path=OUTPUT_DIR, warn: bool=True) -> List[Link]: - """parse and load existing index with any new links from import_path merged in""" - from core.models import Snapshot - try: - return Snapshot.objects.all() - - except (KeyboardInterrupt, SystemExit): - raise SystemExit(0) - -@enforce_types -def load_main_index_meta(out_dir: Path=OUTPUT_DIR) -> Optional[dict]: - index_path = out_dir / JSON_INDEX_FILENAME - if index_path.exists(): - with open(index_path, 'r', encoding='utf-8') as f: - meta_dict = pyjson.load(f) - meta_dict.pop('links') - return meta_dict - - return None - - -@enforce_types -def parse_links_from_source(source_path: str, root_url: Optional[str]=None, parser: str="auto") -> Tuple[List[Link], List[Link]]: - - from ..parsers import parse_links - - new_links: List[Link] = [] - - # parse and validate the import file - raw_links, parser_name = parse_links(source_path, root_url=root_url, parser=parser) - new_links = validate_links(raw_links) - - if parser_name: - num_parsed = len(raw_links) - log_parsing_finished(num_parsed, parser_name) - - return new_links - -@enforce_types -def fix_duplicate_links_in_index(snapshots: QuerySet, links: Iterable[Link]) -> Iterable[Link]: - """ - Given a list of in-memory Links, dedupe and merge them with any conflicting Snapshots in the DB. 
- """ - unique_urls: OrderedDict[str, Link] = OrderedDict() - - for link in links: - index_link = snapshots.filter(url=link.url) - if index_link: - link = merge_links(index_link[0].as_link(), link) - - unique_urls[link.url] = link - - return unique_urls.values() - -@enforce_types -def dedupe_links(snapshots: QuerySet, - new_links: List[Link]) -> List[Link]: - """ - The validation of links happened at a different stage. This method will - focus on actual deduplication and timestamp fixing. - """ - - # merge existing links in out_dir and new links - dedup_links = fix_duplicate_links_in_index(snapshots, new_links) - - new_links = [ - link for link in new_links - if not snapshots.filter(url=link.url).exists() - ] - - dedup_links_dict = {link.url: link for link in dedup_links} - - # Replace links in new_links with the dedup version - for i in range(len(new_links)): - if new_links[i].url in dedup_links_dict.keys(): - new_links[i] = dedup_links_dict[new_links[i].url] - log_deduping_finished(len(new_links)) - - return new_links - -### Link Details Index - -@enforce_types -def write_link_details(link: Link, out_dir: Optional[str]=None, skip_sql_index: bool=False) -> None: - out_dir = out_dir or link.link_dir - - write_json_link_details(link, out_dir=out_dir) - write_html_link_details(link, out_dir=out_dir) - if not skip_sql_index: - write_sql_link_details(link) - - -@enforce_types -def load_link_details(link: Link, out_dir: Optional[str]=None) -> Link: - """check for an existing link archive in the given directory, - and load+merge it into the given link dict - """ - out_dir = out_dir or link.link_dir - - existing_link = parse_json_link_details(out_dir) - if existing_link: - return merge_links(existing_link, link) - - return link - - - -LINK_FILTERS = { - 'exact': lambda pattern: Q(url=pattern), - 'substring': lambda pattern: Q(url__icontains=pattern), - 'regex': lambda pattern: Q(url__iregex=pattern), - 'domain': lambda pattern: Q(url__istartswith=f"http://{pattern}") | 
Q(url__istartswith=f"https://{pattern}") | Q(url__istartswith=f"ftp://{pattern}"), - 'tag': lambda pattern: Q(tags__name=pattern), - 'timestamp': lambda pattern: Q(timestamp=pattern), -} - -@enforce_types -def q_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type: str='exact') -> QuerySet: - q_filter = Q() - for pattern in filter_patterns: - try: - q_filter = q_filter | LINK_FILTERS[filter_type](pattern) - except KeyError: - stderr() - stderr( - f'[X] Got invalid pattern for --filter-type={filter_type}:', - color='red', - ) - stderr(f' {pattern}') - raise SystemExit(2) - return snapshots.filter(q_filter) - -def search_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type: str='search') -> QuerySet: - if not search_backend_enabled(): - stderr() - stderr( - '[X] The search backend is not enabled, set config.USE_SEARCHING_BACKEND = True', - color='red', - ) - raise SystemExit(2) - from core.models import Snapshot - - qsearch = Snapshot.objects.none() - for pattern in filter_patterns: - try: - qsearch |= query_search_index(pattern) - except: - raise SystemExit(2) - - return snapshots & qsearch - -@enforce_types -def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type: str='exact') -> QuerySet: - if filter_type != 'search': - return q_filter(snapshots, filter_patterns, filter_type) - else: - return search_filter(snapshots, filter_patterns, filter_type) - - -def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]: - """indexed links without checking archive status or data directory validity""" - links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()] - return { - link.link_dir: link - for link in links - } - -def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]: - """indexed links that are archived with a valid data directory""" - links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()] - return { - 
link.link_dir: link - for link in filter(is_archived, links) - } - -def get_unarchived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]: - """indexed links that are unarchived with no data directory or an empty data directory""" - links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()] - return { - link.link_dir: link - for link in filter(is_unarchived, links) - } - -def get_present_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]: - """dirs that actually exist in the archive/ folder""" - - all_folders = {} - - for entry in (out_dir / ARCHIVE_DIR_NAME).iterdir(): - if entry.is_dir(): - link = None - try: - link = parse_json_link_details(entry.path) - except Exception: - pass - - all_folders[entry.name] = link - - return all_folders - -def get_valid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]: - """dirs with a valid index matched to the main index and archived content""" - links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()] - return { - link.link_dir: link - for link in filter(is_valid, links) - } - -def get_invalid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]: - """dirs that are invalid for any reason: corrupted/duplicate/orphaned/unrecognized""" - duplicate = get_duplicate_folders(snapshots, out_dir=OUTPUT_DIR) - orphaned = get_orphaned_folders(snapshots, out_dir=OUTPUT_DIR) - corrupted = get_corrupted_folders(snapshots, out_dir=OUTPUT_DIR) - unrecognized = get_unrecognized_folders(snapshots, out_dir=OUTPUT_DIR) - return {**duplicate, **orphaned, **corrupted, **unrecognized} - - -def get_duplicate_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]: - """dirs that conflict with other directories that have the same link URL or timestamp""" - by_url = {} - by_timestamp = {} - duplicate_folders = {} - - data_folders = ( - str(entry) - for entry in (Path(out_dir) / 
ARCHIVE_DIR_NAME).iterdir() - if entry.is_dir() and not snapshots.filter(timestamp=entry.name).exists() - ) - - for path in chain(snapshots.iterator(), data_folders): - link = None - if type(path) is not str: - path = path.as_link().link_dir - - try: - link = parse_json_link_details(path) - except Exception: - pass - - if link: - # link folder has same timestamp as different link folder - by_timestamp[link.timestamp] = by_timestamp.get(link.timestamp, 0) + 1 - if by_timestamp[link.timestamp] > 1: - duplicate_folders[path] = link - - # link folder has same url as different link folder - by_url[link.url] = by_url.get(link.url, 0) + 1 - if by_url[link.url] > 1: - duplicate_folders[path] = link - return duplicate_folders - -def get_orphaned_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]: - """dirs that contain a valid index but aren't listed in the main index""" - orphaned_folders = {} - - for entry in (Path(out_dir) / ARCHIVE_DIR_NAME).iterdir(): - if entry.is_dir(): - link = None - try: - link = parse_json_link_details(str(entry)) - except Exception: - pass - - if link and not snapshots.filter(timestamp=entry.name).exists(): - # folder is a valid link data dir with index details, but it's not in the main index - orphaned_folders[str(entry)] = link - - return orphaned_folders - -def get_corrupted_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]: - """dirs that don't contain a valid index and aren't listed in the main index""" - corrupted = {} - for snapshot in snapshots.iterator(): - link = snapshot.as_link() - if is_corrupt(link): - corrupted[link.link_dir] = link - return corrupted - -def get_unrecognized_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]: - """dirs that don't contain recognizable archive data and aren't listed in the main index""" - unrecognized_folders: Dict[str, Optional[Link]] = {} - - for entry in (Path(out_dir) / ARCHIVE_DIR_NAME).iterdir(): - if entry.is_dir(): - 
index_exists = (entry / "index.json").exists() - link = None - try: - link = parse_json_link_details(str(entry)) - except KeyError: - # Try to fix index - if index_exists: - try: - # Last attempt to repair the detail index - link_guessed = parse_json_link_details(str(entry), guess=True) - write_json_link_details(link_guessed, out_dir=str(entry)) - link = parse_json_link_details(str(entry)) - except Exception: - pass - - if index_exists and link is None: - # index exists but it's corrupted or unparseable - unrecognized_folders[str(entry)] = link - - elif not index_exists: - # link details index doesn't exist and the folder isn't in the main index - timestamp = entry.name - if not snapshots.filter(timestamp=timestamp).exists(): - unrecognized_folders[str(entry)] = link - - return unrecognized_folders - - -def is_valid(link: Link) -> bool: - dir_exists = Path(link.link_dir).exists() - index_exists = (Path(link.link_dir) / "index.json").exists() - if not dir_exists: - # unarchived links are not included in the valid list - return False - if dir_exists and not index_exists: - return False - if dir_exists and index_exists: - try: - parsed_link = parse_json_link_details(link.link_dir, guess=True) - return link.url == parsed_link.url - except Exception: - pass - return False - -def is_corrupt(link: Link) -> bool: - if not Path(link.link_dir).exists(): - # unarchived links are not considered corrupt - return False - - if is_valid(link): - return False - - return True - -def is_archived(link: Link) -> bool: - return is_valid(link) and link.is_archived - -def is_unarchived(link: Link) -> bool: - if not Path(link.link_dir).exists(): - return True - return not link.is_archived - - -def fix_invalid_folder_locations(out_dir: Path=OUTPUT_DIR) -> Tuple[List[str], List[str]]: - fixed = [] - cant_fix = [] - for entry in os.scandir(out_dir / ARCHIVE_DIR_NAME): - if entry.is_dir(follow_symlinks=True): - if (Path(entry.path) / 'index.json').exists(): - try: - link = 
parse_json_link_details(entry.path) - except KeyError: - link = None - if not link: - continue - - if not entry.path.endswith(f'/{link.timestamp}'): - dest = out_dir / ARCHIVE_DIR_NAME / link.timestamp - if dest.exists(): - cant_fix.append(entry.path) - else: - shutil.move(entry.path, dest) - fixed.append(dest) - timestamp = entry.path.rsplit('/', 1)[-1] - assert link.link_dir == entry.path - assert link.timestamp == timestamp - write_json_link_details(link, out_dir=entry.path) - - return fixed, cant_fix diff --git a/archivebox/index/csv.py b/archivebox/index/csv.py deleted file mode 100644 index 804e64611c..0000000000 --- a/archivebox/index/csv.py +++ /dev/null @@ -1,37 +0,0 @@ -__package__ = 'archivebox.index' - -from typing import List, Optional, Any - -from ..util import enforce_types -from .schema import Link - - -@enforce_types -def links_to_csv(links: List[Link], - cols: Optional[List[str]]=None, - header: bool=True, - separator: str=',', - ljust: int=0) -> str: - - cols = cols or ['timestamp', 'is_archived', 'url'] - - header_str = '' - if header: - header_str = separator.join(col.ljust(ljust) for col in cols) - - row_strs = ( - link.to_csv(cols=cols, ljust=ljust, separator=separator) - for link in links - ) - - return '\n'.join((header_str, *row_strs)) - - -@enforce_types -def to_csv(obj: Any, cols: List[str], separator: str=',', ljust: int=0) -> str: - from .json import to_json - - return separator.join( - to_json(getattr(obj, col), indent=None).ljust(ljust) - for col in cols - ) diff --git a/archivebox/index/html.py b/archivebox/index/html.py deleted file mode 100644 index d45f66eaa3..0000000000 --- a/archivebox/index/html.py +++ /dev/null @@ -1,193 +0,0 @@ -__package__ = 'archivebox.index' - -from pathlib import Path -from datetime import datetime, timezone -from collections import defaultdict -from typing import List, Optional, Iterator, Mapping - -from django.utils.html import format_html, mark_safe -from django.core.cache import cache - -from .schema 
import Link -from ..system import atomic_write -from ..logging_util import printable_filesize -from ..util import ( - enforce_types, - ts_to_date_str, - urlencode, - htmlencode, - urldecode, -) -from ..config import ( - OUTPUT_DIR, - VERSION, - FOOTER_INFO, - HTML_INDEX_FILENAME, - SAVE_ARCHIVE_DOT_ORG, -) - -MAIN_INDEX_TEMPLATE = 'static_index.html' -MINIMAL_INDEX_TEMPLATE = 'minimal_index.html' -LINK_DETAILS_TEMPLATE = 'snapshot.html' -TITLE_LOADING_MSG = 'Not yet archived...' - - -### Main Links Index - -@enforce_types -def parse_html_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[str]: - """parse an archive index html file and return the list of urls""" - - index_path = Path(out_dir) / HTML_INDEX_FILENAME - if index_path.exists(): - with open(index_path, 'r', encoding='utf-8') as f: - for line in f: - if 'class="link-url"' in line: - yield line.split('"')[1] - return () - -@enforce_types -def generate_index_from_links(links: List[Link], with_headers: bool): - if with_headers: - output = main_index_template(links) - else: - output = main_index_template(links, template=MINIMAL_INDEX_TEMPLATE) - return output - -@enforce_types -def main_index_template(links: List[Link], template: str=MAIN_INDEX_TEMPLATE) -> str: - """render the template for the entire main index""" - - return render_django_template(template, { - 'version': VERSION, - 'git_sha': VERSION, # not used anymore, but kept for backwards compatibility - 'num_links': str(len(links)), - 'date_updated': datetime.now(timezone.utc).strftime('%Y-%m-%d'), - 'time_updated': datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M'), - 'links': [link._asdict(extended=True) for link in links], - 'FOOTER_INFO': FOOTER_INFO, - }) - - -### Link Details Index - -@enforce_types -def write_html_link_details(link: Link, out_dir: Optional[str]=None) -> None: - out_dir = out_dir or link.link_dir - - rendered_html = link_details_template(link) - atomic_write(str(Path(out_dir) / HTML_INDEX_FILENAME), rendered_html) - - 
-@enforce_types -def link_details_template(link: Link) -> str: - - from ..extractors.wget import wget_output_path - - link_info = link._asdict(extended=True) - - return render_django_template(LINK_DETAILS_TEMPLATE, { - **link_info, - **link_info['canonical'], - 'title': htmlencode( - link.title - or (link.base_url if link.is_archived else TITLE_LOADING_MSG) - ), - 'url_str': htmlencode(urldecode(link.base_url)), - 'archive_url': urlencode( - wget_output_path(link) - or (link.domain if link.is_archived else '') - ) or 'about:blank', - 'extension': link.extension or 'html', - 'tags': link.tags or 'untagged', - 'size': printable_filesize(link.archive_size) if link.archive_size else 'pending', - 'status': 'archived' if link.is_archived else 'not yet archived', - 'status_color': 'success' if link.is_archived else 'danger', - 'oldest_archive_date': ts_to_date_str(link.oldest_archive_date), - 'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG, - }) - -@enforce_types -def render_django_template(template: str, context: Mapping[str, str]) -> str: - """render a given html template string with the given template content""" - from django.template.loader import render_to_string - - return render_to_string(template, context) - - -def snapshot_icons(snapshot) -> str: - cache_key = f'{snapshot.id}-{(snapshot.updated or snapshot.added).timestamp()}-snapshot-icons' - - def calc_snapshot_icons(): - from core.models import EXTRACTORS - # start = datetime.now(timezone.utc) - - archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False) - link = snapshot.as_link() - path = link.archive_path - canon = link.canonical_outputs() - output = "" - output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a>  ' - icons = { - "singlefile": "âļ", - "wget": "🆆", - "dom": "🅷", - "pdf": "📄", - "screenshot": "đŸ’ģ", - "media": "đŸ“ŧ", - "git": "đŸ…ļ", - "archive_org": "🏛", - "readability": "🆁", - "mercury": "đŸ…ŧ", - "warc": "đŸ“Ļ" - } - exclude = ["favicon", 
"title", "headers", "archive_org"] - # Missing specific entry for WARC - - extractor_outputs = defaultdict(lambda: None) - for extractor, _ in EXTRACTORS: - for result in archive_results: - if result.extractor == extractor and result: - extractor_outputs[extractor] = result - - for extractor, _ in EXTRACTORS: - if extractor not in exclude: - existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output - # Check filesystsem to see if anything is actually present (too slow, needs optimization/caching) - # if existing: - # existing = (Path(path) / existing) - # if existing.is_file(): - # existing = True - # elif existing.is_dir(): - # existing = any(existing.glob('*.*')) - output += format_html(output_template, path, canon[f"{extractor}_path"], str(bool(existing)), - extractor, icons.get(extractor, "?")) - if extractor == "wget": - # warc isn't technically it's own extractor, so we have to add it after wget - - # get from db (faster but less thurthful) - exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output - # get from filesystem (slower but more accurate) - # exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz")) - output += format_html(output_template, path, canon["warc_path"], str(bool(exists)), "warc", icons.get("warc", "?")) - - if extractor == "archive_org": - # The check for archive_org is different, so it has to be handled separately - - # get from db (faster) - exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output - # get from filesystem (slower) - # target_path = Path(path) / "archive.org.txt" - # exists = target_path.exists() - output += '<a href="{}" class="exists-{}" title="{}">{}</a> '.format(canon["archive_org_path"], str(exists), - "archive_org", icons.get("archive_org", "?")) - - result = format_html('<span 
class="files-icons" style="font-size: 1.1em; opacity: 0.8; min-width: 240px; display: inline-block">{}<span>', mark_safe(output)) - # end = datetime.now(timezone.utc) - # print(((end - start).total_seconds()*1000) // 1, 'ms') - return result - - return cache.get_or_set(cache_key, calc_snapshot_icons) - # return calc_snapshot_icons() - - diff --git a/archivebox/index/json.py b/archivebox/index/json.py deleted file mode 100644 index 6585009daf..0000000000 --- a/archivebox/index/json.py +++ /dev/null @@ -1,164 +0,0 @@ -__package__ = 'archivebox.index' - -import os -import sys -import json as pyjson -from pathlib import Path - -from datetime import datetime, timezone -from typing import List, Optional, Iterator, Any, Union - -from .schema import Link -from ..system import atomic_write -from ..util import enforce_types -from ..config import ( - VERSION, - OUTPUT_DIR, - FOOTER_INFO, - DEPENDENCIES, - JSON_INDEX_FILENAME, - ARCHIVE_DIR_NAME, - ANSI -) - - -MAIN_INDEX_HEADER = { - 'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.', - 'schema': 'archivebox.index.json', - 'copyright_info': FOOTER_INFO, - 'meta': { - 'project': 'ArchiveBox', - 'version': VERSION, - 'git_sha': VERSION, # not used anymore, but kept for backwards compatibility - 'website': 'https://ArchiveBox.io', - 'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki', - 'source': 'https://github.com/ArchiveBox/ArchiveBox', - 'issues': 'https://github.com/ArchiveBox/ArchiveBox/issues', - 'dependencies': DEPENDENCIES, - }, -} - -@enforce_types -def generate_json_index_from_links(links: List[Link], with_headers: bool): - if with_headers: - output = { - **MAIN_INDEX_HEADER, - 'num_links': len(links), - 'updated': datetime.now(timezone.utc), - 'last_run_cmd': sys.argv, - 'links': links, - } - else: - output = links - return to_json(output, indent=4, sort_keys=True) - - -@enforce_types -def parse_json_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]: - """parse an 
archive index json file and return the list of links""" - - index_path = Path(out_dir) / JSON_INDEX_FILENAME - if index_path.exists(): - with open(index_path, 'r', encoding='utf-8') as f: - try: - links = pyjson.load(f)['links'] - if links: - Link.from_json(links[0]) - except Exception as err: - print(" {lightyellow}! Found an index.json in the project root but couldn't load links from it: {} {}".format( - err.__class__.__name__, - err, - **ANSI, - )) - return () - - for link_json in links: - try: - yield Link.from_json(link_json) - except KeyError: - try: - detail_index_path = Path(OUTPUT_DIR) / ARCHIVE_DIR_NAME / link_json['timestamp'] - yield parse_json_link_details(str(detail_index_path)) - except KeyError: - # as a last effort, try to guess the missing values out of existing ones - try: - yield Link.from_json(link_json, guess=True) - except KeyError: - print(" {lightyellow}! Failed to load the index.json from {}".format(detail_index_path, **ANSI)) - continue - return () - -### Link Details Index - -@enforce_types -def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None: - """write a json file with some info about the link""" - - out_dir = out_dir or link.link_dir - path = Path(out_dir) / JSON_INDEX_FILENAME - atomic_write(str(path), link._asdict(extended=True)) - - -@enforce_types -def parse_json_link_details(out_dir: Union[Path, str], guess: Optional[bool]=False) -> Optional[Link]: - """load the json link index from a given directory""" - existing_index = Path(out_dir) / JSON_INDEX_FILENAME - if existing_index.exists(): - with open(existing_index, 'r', encoding='utf-8') as f: - try: - link_json = pyjson.load(f) - return Link.from_json(link_json, guess) - except pyjson.JSONDecodeError: - pass - return None - - -@enforce_types -def parse_json_links_details(out_dir: Union[Path, str]) -> Iterator[Link]: - """read through all the archive data folders and return the parsed links""" - - for entry in os.scandir(Path(out_dir) / ARCHIVE_DIR_NAME): 
- if entry.is_dir(follow_symlinks=True): - if (Path(entry.path) / 'index.json').exists(): - try: - link = parse_json_link_details(entry.path) - except KeyError: - link = None - if link: - yield link - - - -### Helpers - -class ExtendedEncoder(pyjson.JSONEncoder): - """ - Extended json serializer that supports serializing several model - fields and objects - """ - - def default(self, obj): - cls_name = obj.__class__.__name__ - - if hasattr(obj, '_asdict'): - return obj._asdict() - - elif isinstance(obj, bytes): - return obj.decode() - - elif isinstance(obj, datetime): - return obj.isoformat() - - elif isinstance(obj, Exception): - return '{}: {}'.format(obj.__class__.__name__, obj) - - elif cls_name in ('dict_items', 'dict_keys', 'dict_values'): - return tuple(obj) - - return pyjson.JSONEncoder.default(self, obj) - - -@enforce_types -def to_json(obj: Any, indent: Optional[int]=4, sort_keys: bool=True, cls=ExtendedEncoder) -> str: - return pyjson.dumps(obj, indent=indent, sort_keys=sort_keys, cls=ExtendedEncoder) - diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py deleted file mode 100644 index 480e9c7f79..0000000000 --- a/archivebox/index/schema.py +++ /dev/null @@ -1,457 +0,0 @@ -""" - -WARNING: THIS FILE IS ALL LEGACY CODE TO BE REMOVED. 
- -DO NOT ADD ANY NEW FEATURES TO THIS FILE, NEW CODE GOES HERE: core/models.py - -""" - -__package__ = 'archivebox.index' - -from pathlib import Path - -from datetime import datetime, timezone, timedelta - -from typing import List, Dict, Any, Optional, Union - -from dataclasses import dataclass, asdict, field, fields - -from django.utils.functional import cached_property - -from ..system import get_dir_size -from ..util import ts_to_date_str, parse_date -from ..config import OUTPUT_DIR, ARCHIVE_DIR_NAME - -class ArchiveError(Exception): - def __init__(self, message, hints=None): - super().__init__(message) - self.hints = hints - -LinkDict = Dict[str, Any] - -ArchiveOutput = Union[str, Exception, None] - -@dataclass(frozen=True) -class ArchiveResult: - cmd: List[str] - pwd: Optional[str] - cmd_version: Optional[str] - output: ArchiveOutput - status: str - start_ts: datetime - end_ts: datetime - index_texts: Union[List[str], None] = None - schema: str = 'ArchiveResult' - - def __post_init__(self): - self.typecheck() - - def _asdict(self): - return asdict(self) - - def typecheck(self) -> None: - assert self.schema == self.__class__.__name__ - assert isinstance(self.status, str) and self.status - assert isinstance(self.start_ts, datetime) - assert isinstance(self.end_ts, datetime) - assert isinstance(self.cmd, list) - assert all(isinstance(arg, str) and arg for arg in self.cmd) - - # TODO: replace emptystrings in these three with None / remove them from the DB - assert self.pwd is None or isinstance(self.pwd, str) - assert self.cmd_version is None or isinstance(self.cmd_version, str) - assert self.output is None or isinstance(self.output, (str, Exception)) - - @classmethod - def guess_ts(_cls, dict_info): - from ..util import parse_date - parsed_timestamp = parse_date(dict_info["timestamp"]) - start_ts = parsed_timestamp - end_ts = parsed_timestamp + timedelta(seconds=int(dict_info["duration"])) - return start_ts, end_ts - - @classmethod - def from_json(cls, 
json_info, guess=False): - from ..util import parse_date - - info = { - key: val - for key, val in json_info.items() - if key in cls.field_names() - } - if guess: - keys = info.keys() - if "start_ts" not in keys: - info["start_ts"], info["end_ts"] = cls.guess_ts(json_info) - else: - info['start_ts'] = parse_date(info['start_ts']) - info['end_ts'] = parse_date(info['end_ts']) - if "pwd" not in keys: - info["pwd"] = str(Path(OUTPUT_DIR) / ARCHIVE_DIR_NAME / json_info["timestamp"]) - if "cmd_version" not in keys: - info["cmd_version"] = "Undefined" - if "cmd" not in keys: - info["cmd"] = [] - else: - info['start_ts'] = parse_date(info['start_ts']) - info['end_ts'] = parse_date(info['end_ts']) - info['cmd_version'] = info.get('cmd_version') - if type(info["cmd"]) is str: - info["cmd"] = [info["cmd"]] - return cls(**info) - - def to_dict(self, *keys) -> dict: - if keys: - return {k: v for k, v in asdict(self).items() if k in keys} - return asdict(self) - - def to_json(self, indent=4, sort_keys=True) -> str: - from .json import to_json - - return to_json(self, indent=indent, sort_keys=sort_keys) - - def to_csv(self, cols: Optional[List[str]]=None, separator: str=',', ljust: int=0) -> str: - from .csv import to_csv - - return to_csv(self, csv_col=cols or self.field_names(), separator=separator, ljust=ljust) - - @classmethod - def field_names(cls): - return [f.name for f in fields(cls)] - - @property - def duration(self) -> int: - return (self.end_ts - self.start_ts).seconds - -@dataclass(frozen=True) -class Link: - timestamp: str - url: str - title: Optional[str] - tags: Optional[str] - sources: List[str] - history: Dict[str, List[ArchiveResult]] = field(default_factory=lambda: {}) - updated: Optional[datetime] = None - schema: str = 'Link' - - def __str__(self) -> str: - return f'[{self.timestamp}] {self.url} "{self.title}"' - - def __post_init__(self): - self.typecheck() - - def overwrite(self, **kwargs): - """pure functional version of dict.update that returns a new 
instance""" - return Link(**{**self._asdict(), **kwargs}) - - def __eq__(self, other): - if not isinstance(other, Link): - return NotImplemented - return self.url == other.url - - def __gt__(self, other): - if not isinstance(other, Link): - return NotImplemented - if not self.timestamp or not other.timestamp: - return - return float(self.timestamp) > float(other.timestamp) - - def typecheck(self) -> None: - from ..config import stderr, ANSI - try: - assert self.schema == self.__class__.__name__ - assert isinstance(self.timestamp, str) and self.timestamp - assert self.timestamp.replace('.', '').isdigit() - assert isinstance(self.url, str) and '://' in self.url - assert self.updated is None or isinstance(self.updated, datetime) - assert self.title is None or (isinstance(self.title, str) and self.title) - assert self.tags is None or isinstance(self.tags, str) - assert isinstance(self.sources, list) - assert all(isinstance(source, str) and source for source in self.sources) - assert isinstance(self.history, dict) - for method, results in self.history.items(): - assert isinstance(method, str) and method - assert isinstance(results, list) - assert all(isinstance(result, ArchiveResult) for result in results) - except Exception: - stderr('{red}[X] Error while loading link! 
[{}] {} "{}"{reset}'.format(self.timestamp, self.url, self.title, **ANSI)) - raise - - def _asdict(self, extended=False): - info = { - 'schema': 'Link', - 'url': self.url, - 'title': self.title or None, - 'timestamp': self.timestamp, - 'updated': self.updated or None, - 'tags': self.tags or None, - 'sources': self.sources or [], - 'history': self.history or {}, - } - if extended: - info.update({ - 'snapshot_id': self.snapshot_id, - 'link_dir': self.link_dir, - 'archive_path': self.archive_path, - - 'hash': self.url_hash, - 'base_url': self.base_url, - 'scheme': self.scheme, - 'domain': self.domain, - 'path': self.path, - 'basename': self.basename, - 'extension': self.extension, - 'is_static': self.is_static, - - 'tags_str': (self.tags or '').strip(','), # only used to render static index in index/html.py, remove if no longer needed there - 'icons': None, # only used to render static index in index/html.py, remove if no longer needed there - - 'bookmarked_date': self.bookmarked_date, - 'updated_date': self.updated_date, - 'oldest_archive_date': self.oldest_archive_date, - 'newest_archive_date': self.newest_archive_date, - - 'is_archived': self.is_archived, - 'num_outputs': self.num_outputs, - 'num_failures': self.num_failures, - - 'latest': self.latest_outputs(), - 'canonical': self.canonical_outputs(), - }) - return info - - def as_snapshot(self): - from core.models import Snapshot - return Snapshot.objects.get(url=self.url) - - @classmethod - def from_json(cls, json_info, guess=False): - from ..util import parse_date - - info = { - key: val - for key, val in json_info.items() - if key in cls.field_names() - } - info['updated'] = parse_date(info.get('updated')) - info['sources'] = info.get('sources') or [] - - json_history = info.get('history') or {} - cast_history = {} - - for method, method_history in json_history.items(): - cast_history[method] = [] - for json_result in method_history: - assert isinstance(json_result, dict), 'Items in Link["history"][method] 
must be dicts' - cast_result = ArchiveResult.from_json(json_result, guess) - cast_history[method].append(cast_result) - - info['history'] = cast_history - return cls(**info) - - def to_json(self, indent=4, sort_keys=True) -> str: - from .json import to_json - - return to_json(self, indent=indent, sort_keys=sort_keys) - - def to_csv(self, cols: Optional[List[str]]=None, separator: str=',', ljust: int=0) -> str: - from .csv import to_csv - - return to_csv(self, cols=cols or self.field_names(), separator=separator, ljust=ljust) - - @cached_property - def snapshot_id(self): - from core.models import Snapshot - return str(Snapshot.objects.only('id').get(url=self.url).id) - - @classmethod - def field_names(cls): - return [f.name for f in fields(cls)] - - @property - def link_dir(self) -> str: - from ..config import CONFIG - return str(Path(CONFIG['ARCHIVE_DIR']) / self.timestamp) - - @property - def archive_path(self) -> str: - from ..config import ARCHIVE_DIR_NAME - return '{}/{}'.format(ARCHIVE_DIR_NAME, self.timestamp) - - @property - def archive_size(self) -> float: - try: - return get_dir_size(self.archive_path)[0] - except Exception: - return 0 - - ### URL Helpers - @property - def url_hash(self): - from ..util import hashurl - - return hashurl(self.url) - - @property - def scheme(self) -> str: - from ..util import scheme - return scheme(self.url) - - @property - def extension(self) -> str: - from ..util import extension - return extension(self.url) - - @property - def domain(self) -> str: - from ..util import domain - return domain(self.url) - - @property - def path(self) -> str: - from ..util import path - return path(self.url) - - @property - def basename(self) -> str: - from ..util import basename - return basename(self.url) - - @property - def base_url(self) -> str: - from ..util import base_url - return base_url(self.url) - - ### Pretty Printing Helpers - @property - def bookmarked_date(self) -> Optional[str]: - max_ts = (datetime.now(timezone.utc) + 
timedelta(days=30)).timestamp() - - if self.timestamp and self.timestamp.replace('.', '').isdigit(): - if 0 < float(self.timestamp) < max_ts: - return ts_to_date_str(datetime.fromtimestamp(float(self.timestamp))) - else: - return str(self.timestamp) - return None - - - @property - def updated_date(self) -> Optional[str]: - return ts_to_date_str(self.updated) if self.updated else None - - @property - def archive_dates(self) -> List[datetime]: - return [ - parse_date(result.start_ts) - for method in self.history.keys() - for result in self.history[method] - ] - - @property - def oldest_archive_date(self) -> Optional[datetime]: - return min(self.archive_dates, default=None) - - @property - def newest_archive_date(self) -> Optional[datetime]: - return max(self.archive_dates, default=None) - - ### Archive Status Helpers - @property - def num_outputs(self) -> int: - return self.as_snapshot().num_outputs - - @property - def num_failures(self) -> int: - return sum(1 - for method in self.history.keys() - for result in self.history[method] - if result.status == 'failed') - - @property - def is_static(self) -> bool: - from ..util import is_static_file - return is_static_file(self.url) - - @property - def is_archived(self) -> bool: - from ..config import ARCHIVE_DIR - from ..util import domain - - output_paths = ( - domain(self.url), - 'output.pdf', - 'screenshot.png', - 'output.html', - 'media', - 'singlefile.html' - ) - - return any( - (Path(ARCHIVE_DIR) / self.timestamp / path).exists() - for path in output_paths - ) - - def latest_outputs(self, status: str=None) -> Dict[str, ArchiveOutput]: - """get the latest output that each archive method produced for link""" - - ARCHIVE_METHODS = ( - 'title', 'favicon', 'wget', 'warc', 'singlefile', 'pdf', - 'screenshot', 'dom', 'git', 'media', 'archive_org', - ) - latest: Dict[str, ArchiveOutput] = {} - for archive_method in ARCHIVE_METHODS: - # get most recent succesful result in history for each archive method - history = 
self.history.get(archive_method) or [] - history = list(filter(lambda result: result.output, reversed(history))) - if status is not None: - history = list(filter(lambda result: result.status == status, history)) - - history = list(history) - if history: - latest[archive_method] = history[0].output - else: - latest[archive_method] = None - return latest - - - def canonical_outputs(self) -> Dict[str, Optional[str]]: - """predict the expected output paths that should be present after archiving""" - - from ..extractors.wget import wget_output_path - # TODO: banish this awful duplication from the codebase and import these - # from their respective extractor files - canonical = { - 'index_path': 'index.html', - 'favicon_path': 'favicon.ico', - 'google_favicon_path': 'https://www.google.com/s2/favicons?domain={}'.format(self.domain), - 'wget_path': wget_output_path(self), - 'warc_path': 'warc/', - 'singlefile_path': 'singlefile.html', - 'readability_path': 'readability/content.html', - 'mercury_path': 'mercury/content.html', - 'pdf_path': 'output.pdf', - 'screenshot_path': 'screenshot.png', - 'dom_path': 'output.html', - 'archive_org_path': 'https://web.archive.org/web/{}'.format(self.base_url), - 'git_path': 'git/', - 'media_path': 'media/', - 'headers_path': 'headers.json', - } - if self.is_static: - # static binary files like PDF and images are handled slightly differently. 
- # they're just downloaded once and aren't archived separately multiple times, - # so the wget, screenshot, & pdf urls should all point to the same file - - static_path = wget_output_path(self) - canonical.update({ - 'title': self.basename, - 'wget_path': static_path, - 'pdf_path': static_path, - 'screenshot_path': static_path, - 'dom_path': static_path, - 'singlefile_path': static_path, - 'readability_path': static_path, - 'mercury_path': static_path, - }) - return canonical - diff --git a/archivebox/index/sql.py b/archivebox/index/sql.py deleted file mode 100644 index 2fcabd61e6..0000000000 --- a/archivebox/index/sql.py +++ /dev/null @@ -1,146 +0,0 @@ -__package__ = 'archivebox.index' - -from io import StringIO -from pathlib import Path -from typing import List, Tuple, Iterator -from django.db.models import QuerySet -from django.db import transaction - -from .schema import Link -from ..util import enforce_types, parse_date -from ..config import OUTPUT_DIR - - -### Main Links Index - -@enforce_types -def parse_sql_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]: - from core.models import Snapshot - - return ( - Link.from_json(page.as_json(*Snapshot.keys)) - for page in Snapshot.objects.all() - ) - -@enforce_types -def remove_from_sql_main_index(snapshots: QuerySet, atomic: bool=False, out_dir: Path=OUTPUT_DIR) -> None: - if atomic: - with transaction.atomic(): - return snapshots.delete() - return snapshots.delete() - -@enforce_types -def write_link_to_sql_index(link: Link): - from core.models import Snapshot, ArchiveResult - info = {k: v for k, v in link._asdict().items() if k in Snapshot.keys} - tags = info.pop("tags") - if tags is None: - tags = [] - - try: - info["timestamp"] = Snapshot.objects.get(url=link.url).timestamp - except Snapshot.DoesNotExist: - while Snapshot.objects.filter(timestamp=info["timestamp"]).exists(): - info["timestamp"] = str(float(info["timestamp"]) + 1.0) - - snapshot, _ = Snapshot.objects.update_or_create(url=link.url, 
defaults=info) - snapshot.save_tags(tags) - - for extractor, entries in link.history.items(): - for entry in entries: - if isinstance(entry, dict): - result, _ = ArchiveResult.objects.get_or_create( - snapshot_id=snapshot.id, - extractor=extractor, - start_ts=parse_date(entry['start_ts']), - defaults={ - 'end_ts': parse_date(entry['end_ts']), - 'cmd': entry['cmd'], - 'output': entry['output'], - 'cmd_version': entry.get('cmd_version') or 'unknown', - 'pwd': entry['pwd'], - 'status': entry['status'], - } - ) - else: - result, _ = ArchiveResult.objects.update_or_create( - snapshot_id=snapshot.id, - extractor=extractor, - start_ts=parse_date(entry.start_ts), - defaults={ - 'end_ts': parse_date(entry.end_ts), - 'cmd': entry.cmd, - 'output': entry.output, - 'cmd_version': entry.cmd_version or 'unknown', - 'pwd': entry.pwd, - 'status': entry.status, - } - ) - - return snapshot - - -@enforce_types -def write_sql_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None: - for link in links: - # with transaction.atomic(): - # write_link_to_sql_index(link) - write_link_to_sql_index(link) - - -@enforce_types -def write_sql_link_details(link: Link, out_dir: Path=OUTPUT_DIR) -> None: - from core.models import Snapshot - - # with transaction.atomic(): - # try: - # snap = Snapshot.objects.get(url=link.url) - # except Snapshot.DoesNotExist: - # snap = write_link_to_sql_index(link) - # snap.title = link.title - try: - snap = Snapshot.objects.get(url=link.url) - except Snapshot.DoesNotExist: - snap = write_link_to_sql_index(link) - snap.title = link.title - - tag_set = ( - set(tag.strip() for tag in (link.tags or '').split(',')) - ) - tag_list = list(tag_set) or [] - - snap.save() - snap.save_tags(tag_list) - - - -@enforce_types -def list_migrations(out_dir: Path=OUTPUT_DIR) -> List[Tuple[bool, str]]: - from django.core.management import call_command - out = StringIO() - call_command("showmigrations", list=True, stdout=out) - out.seek(0) - migrations = [] - for line in 
out.readlines(): - if line.strip() and ']' in line: - status_str, name_str = line.strip().split(']', 1) - is_applied = 'X' in status_str - migration_name = name_str.strip() - migrations.append((is_applied, migration_name)) - - return migrations - -@enforce_types -def apply_migrations(out_dir: Path=OUTPUT_DIR) -> List[str]: - from django.core.management import call_command - null, out = StringIO(), StringIO() - call_command("makemigrations", interactive=False, stdout=null) - call_command("migrate", interactive=False, stdout=out) - out.seek(0) - - return [line.strip() for line in out.readlines() if line.strip()] - -@enforce_types -def get_admins(out_dir: Path=OUTPUT_DIR) -> List[str]: - from django.contrib.auth.models import User - return User.objects.filter(is_superuser=True) diff --git a/archivebox/ldap/__init__.py b/archivebox/ldap/__init__.py new file mode 100644 index 0000000000..560f3460e1 --- /dev/null +++ b/archivebox/ldap/__init__.py @@ -0,0 +1,17 @@ +""" +LDAP authentication module for ArchiveBox. + +This module provides native LDAP authentication support using django-auth-ldap. +It only activates if: +1. LDAP_ENABLED=True in config +2. 
Required LDAP libraries (python-ldap, django-auth-ldap) are installed + +To install LDAP dependencies: + pip install archivebox[ldap] + +Or manually: + apt install build-essential python3-dev libsasl2-dev libldap2-dev libssl-dev + pip install python-ldap django-auth-ldap +""" + +__package__ = "archivebox.ldap" diff --git a/archivebox/ldap/apps.py b/archivebox/ldap/apps.py new file mode 100644 index 0000000000..54390c6232 --- /dev/null +++ b/archivebox/ldap/apps.py @@ -0,0 +1,13 @@ +"""Django app configuration for LDAP authentication.""" + +__package__ = "archivebox.ldap" + +from django.apps import AppConfig + + +class LDAPConfig(AppConfig): + """Django app config for LDAP authentication.""" + + default_auto_field = "django.db.models.BigAutoField" + name = "archivebox.ldap" + verbose_name = "LDAP Authentication" diff --git a/archivebox/ldap/auth.py b/archivebox/ldap/auth.py new file mode 100644 index 0000000000..dd1ac4626c --- /dev/null +++ b/archivebox/ldap/auth.py @@ -0,0 +1,50 @@ +""" +LDAP authentication backend for ArchiveBox. + +This module extends django-auth-ldap to support the LDAP_CREATE_SUPERUSER flag. +""" + +__package__ = "archivebox.ldap" + +import importlib + +try: + BaseLDAPBackend = importlib.import_module("django_auth_ldap.backend").LDAPBackend +except ImportError: + + class BaseLDAPBackend: + """Dummy LDAP backend when django-auth-ldap is not installed.""" + + pass + + +class ArchiveBoxLDAPBackend(BaseLDAPBackend): + """ + Custom LDAP authentication backend for ArchiveBox. + + Extends django-auth-ldap's LDAPBackend to support: + - LDAP_CREATE_SUPERUSER: Automatically grant superuser privileges to LDAP users + """ + + def authenticate_ldap_user(self, ldap_user, password): + """ + Authenticate using LDAP and optionally grant superuser privileges. + + This method is called by django-auth-ldap after successful LDAP authentication. 
+ """ + from archivebox.config.ldap import LDAP_CONFIG + + base_authenticate = getattr(super(), "authenticate_ldap_user", None) + if base_authenticate is None: + return None + + user = base_authenticate(ldap_user, password) + + if user and LDAP_CONFIG.LDAP_CREATE_SUPERUSER: + # Grant superuser privileges to all LDAP-authenticated users + if not user.is_superuser: + user.is_superuser = True + user.is_staff = True + user.save() + + return user diff --git a/archivebox/logging_util.py b/archivebox/logging_util.py deleted file mode 100644 index 6be14f029f..0000000000 --- a/archivebox/logging_util.py +++ /dev/null @@ -1,636 +0,0 @@ -__package__ = 'archivebox' - -import re -import os -import sys -import stat -import time -import argparse -from math import log -from multiprocessing import Process -from pathlib import Path - -from datetime import datetime, timezone -from dataclasses import dataclass -from typing import Any, Optional, List, Dict, Union, IO, TYPE_CHECKING - -if TYPE_CHECKING: - from .index.schema import Link, ArchiveResult - -from .system import get_dir_size -from .util import enforce_types -from .config import ( - ConfigDict, - OUTPUT_DIR, - PYTHON_ENCODING, - VERSION, - ANSI, - IS_TTY, - IN_DOCKER, - TERM_WIDTH, - SHOW_PROGRESS, - SOURCES_DIR_NAME, - stderr, -) - -@dataclass -class RuntimeStats: - """mutable stats counter for logging archiving timing info to CLI output""" - - skipped: int = 0 - succeeded: int = 0 - failed: int = 0 - - parse_start_ts: Optional[datetime] = None - parse_end_ts: Optional[datetime] = None - - index_start_ts: Optional[datetime] = None - index_end_ts: Optional[datetime] = None - - archiving_start_ts: Optional[datetime] = None - archiving_end_ts: Optional[datetime] = None - -# globals are bad, mmkay -_LAST_RUN_STATS = RuntimeStats() - - -def debug_dict_summary(obj: Dict[Any, Any]) -> None: - stderr(' '.join(f'{key}={str(val).ljust(6)}' for key, val in obj.items())) - - -def get_fd_info(fd) -> Dict[str, Any]: - NAME = fd.name[1:-1] 
- FILENO = fd.fileno() - MODE = os.fstat(FILENO).st_mode - IS_TTY = hasattr(fd, 'isatty') and fd.isatty() - IS_PIPE = stat.S_ISFIFO(MODE) - IS_FILE = stat.S_ISREG(MODE) - IS_TERMINAL = not (IS_PIPE or IS_FILE) - IS_LINE_BUFFERED = fd.line_buffering - IS_READABLE = fd.readable() - return { - 'NAME': NAME, 'FILENO': FILENO, 'MODE': MODE, - 'IS_TTY': IS_TTY, 'IS_PIPE': IS_PIPE, 'IS_FILE': IS_FILE, - 'IS_TERMINAL': IS_TERMINAL, 'IS_LINE_BUFFERED': IS_LINE_BUFFERED, - 'IS_READABLE': IS_READABLE, - } - - -# # Log debug information about stdin, stdout, and stderr -# sys.stdout.write('[>&1] this is python stdout\n') -# sys.stderr.write('[>&2] this is python stderr\n') - -# debug_dict_summary(get_fd_info(sys.stdin)) -# debug_dict_summary(get_fd_info(sys.stdout)) -# debug_dict_summary(get_fd_info(sys.stderr)) - - - -class SmartFormatter(argparse.HelpFormatter): - """Patched formatter that prints newlines in argparse help strings""" - def _split_lines(self, text, width): - if '\n' in text: - return text.splitlines() - return argparse.HelpFormatter._split_lines(self, text, width) - - -def reject_stdin(caller: str, stdin: Optional[IO]=sys.stdin) -> None: - """Tell the user they passed stdin to a command that doesn't accept it""" - - if not stdin: - return None - - if IN_DOCKER: - # when TTY is disabled in docker we cant tell if stdin is being piped in or not - # if we try to read stdin when its not piped we will hang indefinitely waiting for it - return None - - if not stdin.isatty(): - # stderr('READING STDIN TO REJECT...') - stdin_raw_text = stdin.read() - if stdin_raw_text: - # stderr('GOT STDIN!', len(stdin_str)) - stderr(f'[X] The "{caller}" command does not accept stdin.', color='red') - stderr(f' Run archivebox "{caller} --help" to see usage and examples.') - stderr() - raise SystemExit(1) - return None - - -def accept_stdin(stdin: Optional[IO]=sys.stdin) -> Optional[str]: - """accept any standard input and return it as a string or None""" - - if not stdin: - return None 
- - if not stdin.isatty(): - # stderr('READING STDIN TO ACCEPT...') - stdin_str = stdin.read() - - if stdin_str: - # stderr('GOT STDIN...', len(stdin_str)) - return stdin_str - - return None - - -class TimedProgress: - """Show a progress bar and measure elapsed time until .end() is called""" - - def __init__(self, seconds, prefix=''): - - self.SHOW_PROGRESS = SHOW_PROGRESS - if self.SHOW_PROGRESS: - self.p = Process(target=progress_bar, args=(seconds, prefix)) - self.p.start() - - self.stats = {'start_ts': datetime.now(timezone.utc), 'end_ts': None} - - def end(self): - """immediately end progress, clear the progressbar line, and save end_ts""" - - - end_ts = datetime.now(timezone.utc) - self.stats['end_ts'] = end_ts - - if self.SHOW_PROGRESS: - # terminate if we havent already terminated - try: - # kill the progress bar subprocess - try: - self.p.close() # must be closed *before* its terminnated - except (KeyboardInterrupt, SystemExit): - print() - raise - except BaseException: # lgtm [py/catch-base-exception] - pass - self.p.terminate() - self.p.join() - - - # clear whole terminal line - try: - sys.stdout.write('\r{}{}\r'.format((' ' * TERM_WIDTH()), ANSI['reset'])) - except (IOError, BrokenPipeError): - # ignore when the parent proc has stopped listening to our stdout - pass - except ValueError: - pass - - -@enforce_types -def progress_bar(seconds: int, prefix: str='') -> None: - """show timer in the form of progress bar, with percentage and seconds remaining""" - chunk = '█' if PYTHON_ENCODING == 'UTF-8' else '#' - last_width = TERM_WIDTH() - chunks = last_width - len(prefix) - 20 # number of progress chunks to show (aka max bar width) - try: - for s in range(seconds * chunks): - max_width = TERM_WIDTH() - if max_width < last_width: - # when the terminal size is shrunk, we have to write a newline - # otherwise the progress bar will keep wrapping incorrectly - sys.stdout.write('\r\n') - sys.stdout.flush() - chunks = max_width - len(prefix) - 20 - pct_complete = 
s / chunks / seconds * 100 - log_pct = (log(pct_complete or 1, 10) / 2) * 100 # everyone likes faster progress bars ;) - bar_width = round(log_pct/(100/chunks)) - last_width = max_width - - # ████████████████████ 0.9% (1/60sec) - sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)'.format( - prefix, - ANSI['green' if pct_complete < 80 else 'lightyellow'], - (chunk * bar_width).ljust(chunks), - ANSI['reset'], - round(pct_complete, 1), - round(s/chunks), - seconds, - )) - sys.stdout.flush() - time.sleep(1 / chunks) - - # ██████████████████████████████████ 100.0% (60/60sec) - sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)'.format( - prefix, - ANSI['red'], - chunk * chunks, - ANSI['reset'], - 100.0, - seconds, - seconds, - )) - sys.stdout.flush() - # uncomment to have it disappear when it hits 100% instead of staying full red: - # time.sleep(0.5) - # sys.stdout.write('\r{}{}\r'.format((' ' * TERM_WIDTH()), ANSI['reset'])) - # sys.stdout.flush() - except (KeyboardInterrupt, BrokenPipeError): - print() - - -def log_cli_command(subcommand: str, subcommand_args: List[str], stdin: Optional[str], pwd: str): - cmd = ' '.join(('archivebox', subcommand, *subcommand_args)) - stderr('{black}[i] [{now}] ArchiveBox v{VERSION}: {cmd}{reset}'.format( - now=datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S'), - VERSION=VERSION, - cmd=cmd, - **ANSI, - )) - stderr('{black} > {pwd}{reset}'.format(pwd=pwd, **ANSI)) - stderr() - -### Parsing Stage - - -def log_importing_started(urls: Union[str, List[str]], depth: int, index_only: bool): - _LAST_RUN_STATS.parse_start_ts = datetime.now(timezone.utc) - print('{green}[+] [{}] Adding {} links to index (crawl depth={}){}...{reset}'.format( - _LAST_RUN_STATS.parse_start_ts.strftime('%Y-%m-%d %H:%M:%S'), - len(urls) if isinstance(urls, list) else len(urls.split('\n')), - depth, - ' (index only)' if index_only else '', - **ANSI, - )) - -def log_source_saved(source_file: str): - print(' > Saved verbatim input to 
{}/{}'.format(SOURCES_DIR_NAME, source_file.rsplit('/', 1)[-1])) - -def log_parsing_finished(num_parsed: int, parser_name: str): - _LAST_RUN_STATS.parse_end_ts = datetime.now(timezone.utc) - print(' > Parsed {} URLs from input ({})'.format(num_parsed, parser_name)) - -def log_deduping_finished(num_new_links: int): - print(' > Found {} new URLs not already in index'.format(num_new_links)) - - -def log_crawl_started(new_links): - print() - print('{green}[*] Starting crawl of {} sites 1 hop out from starting point{reset}'.format(len(new_links), **ANSI)) - -### Indexing Stage - -def log_indexing_process_started(num_links: int): - start_ts = datetime.now(timezone.utc) - _LAST_RUN_STATS.index_start_ts = start_ts - print() - print('{black}[*] [{}] Writing {} links to main index...{reset}'.format( - start_ts.strftime('%Y-%m-%d %H:%M:%S'), - num_links, - **ANSI, - )) - - -def log_indexing_process_finished(): - end_ts = datetime.now(timezone.utc) - _LAST_RUN_STATS.index_end_ts = end_ts - - -def log_indexing_started(out_path: str): - if IS_TTY: - sys.stdout.write(f' > ./{Path(out_path).relative_to(OUTPUT_DIR)}') - - -def log_indexing_finished(out_path: str): - print(f'\r √ ./{Path(out_path).relative_to(OUTPUT_DIR)}') - - -### Archiving Stage - -def log_archiving_started(num_links: int, resume: Optional[float]=None): - - start_ts = datetime.now(timezone.utc) - _LAST_RUN_STATS.archiving_start_ts = start_ts - print() - if resume: - print('{green}[â–ļ] [{}] Resuming archive updating for {} pages starting from {}...{reset}'.format( - start_ts.strftime('%Y-%m-%d %H:%M:%S'), - num_links, - resume, - **ANSI, - )) - else: - print('{green}[â–ļ] [{}] Starting archiving of {} snapshots in index...{reset}'.format( - start_ts.strftime('%Y-%m-%d %H:%M:%S'), - num_links, - **ANSI, - )) - -def log_archiving_paused(num_links: int, idx: int, timestamp: str): - - end_ts = datetime.now(timezone.utc) - _LAST_RUN_STATS.archiving_end_ts = end_ts - print() - print('\n{lightyellow}[X] [{now}] 
Downloading paused on link {timestamp} ({idx}/{total}){reset}'.format( - **ANSI, - now=end_ts.strftime('%Y-%m-%d %H:%M:%S'), - idx=idx+1, - timestamp=timestamp, - total=num_links, - )) - print() - print(' Continue archiving where you left off by running:') - print(' archivebox update --resume={}'.format(timestamp)) - -def log_archiving_finished(num_links: int): - - from core.models import Snapshot - - end_ts = datetime.now(timezone.utc) - _LAST_RUN_STATS.archiving_end_ts = end_ts - assert _LAST_RUN_STATS.archiving_start_ts is not None - seconds = end_ts.timestamp() - _LAST_RUN_STATS.archiving_start_ts.timestamp() - if seconds > 60: - duration = '{0:.2f} min'.format(seconds / 60) - else: - duration = '{0:.2f} sec'.format(seconds) - - print() - print('{}[√] [{}] Update of {} pages complete ({}){}'.format( - ANSI['green'], - end_ts.strftime('%Y-%m-%d %H:%M:%S'), - num_links, - duration, - ANSI['reset'], - )) - print(' - {} links skipped'.format(_LAST_RUN_STATS.skipped)) - print(' - {} links updated'.format(_LAST_RUN_STATS.succeeded + _LAST_RUN_STATS.failed)) - print(' - {} links had errors'.format(_LAST_RUN_STATS.failed)) - - if Snapshot.objects.count() < 50: - print() - print(' {lightred}Hint:{reset} To manage your archive in a Web UI, run:'.format(**ANSI)) - print(' archivebox server 0.0.0.0:8000') - - -def log_link_archiving_started(link: "Link", link_dir: str, is_new: bool): - - # [*] [2019-03-22 13:46:45] "Log Structured Merge Trees - ben stopford" - # http://www.benstopford.com/2015/02/14/log-structured-merge-trees/ - # > output/archive/1478739709 - - print('\n[{symbol_color}{symbol}{reset}] [{symbol_color}{now}{reset}] "{title}"'.format( - symbol_color=ANSI['green' if is_new else 'black'], - symbol='+' if is_new else '√', - now=datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S'), - title=link.title or link.base_url, - **ANSI, - )) - print(' {blue}{url}{reset}'.format(url=link.url, **ANSI)) - print(' {} {}'.format( - '>' if is_new else '√', - 
pretty_path(link_dir), - )) - -def log_link_archiving_finished(link: "Link", link_dir: str, is_new: bool, stats: dict, start_ts: datetime): - total = sum(stats.values()) - - if stats['failed'] > 0 : - _LAST_RUN_STATS.failed += 1 - elif stats['skipped'] == total: - _LAST_RUN_STATS.skipped += 1 - else: - _LAST_RUN_STATS.succeeded += 1 - - size = get_dir_size(link_dir) - end_ts = datetime.now(timezone.utc) - duration = str(end_ts - start_ts).split('.')[0] - print(' {black}{} files ({}) in {}s {reset}'.format(size[2], printable_filesize(size[0]), duration, **ANSI)) - - -def log_archive_method_started(method: str): - print(' > {}'.format(method)) - - -def log_archive_method_finished(result: "ArchiveResult"): - """quote the argument with whitespace in a command so the user can - copy-paste the outputted string directly to run the cmd - """ - # Prettify CMD string and make it safe to copy-paste by quoting arguments - quoted_cmd = ' '.join( - '"{}"'.format(arg) if ' ' in arg else arg - for arg in result.cmd - ) - - if result.status == 'failed': - if result.output.__class__.__name__ == 'TimeoutExpired': - duration = (result.end_ts - result.start_ts).seconds - hint_header = [ - '{lightyellow}Extractor timed out after {}s.{reset}'.format(duration, **ANSI), - ] - else: - hint_header = [ - '{lightyellow}Extractor failed:{reset}'.format(**ANSI), - ' {reset}{} {red}{}{reset}'.format( - result.output.__class__.__name__.replace('ArchiveError', ''), - result.output, - **ANSI, - ), - ] - - # Prettify error output hints string and limit to five lines - hints = getattr(result.output, 'hints', None) or () - if hints: - hints = hints if isinstance(hints, (list, tuple)) else hints.split('\n') - hints = ( - ' {}{}{}'.format(ANSI['lightyellow'], line.strip(), ANSI['reset']) - for line in hints[:5] if line.strip() - ) - - - # Collect and prefix output lines with indentation - output_lines = [ - *hint_header, - *hints, - '{}Run to see full output:{}'.format(ANSI['lightred'], ANSI['reset']), - 
*([' cd {};'.format(result.pwd)] if result.pwd else []), - ' {}'.format(quoted_cmd), - ] - print('\n'.join( - ' {}'.format(line) - for line in output_lines - if line - )) - print() - - -def log_list_started(filter_patterns: Optional[List[str]], filter_type: str): - print('{green}[*] Finding links in the archive index matching these {} patterns:{reset}'.format( - filter_type, - **ANSI, - )) - print(' {}'.format(' '.join(filter_patterns or ()))) - -def log_list_finished(links): - from .index.csv import links_to_csv - print() - print('---------------------------------------------------------------------------------------------------') - print(links_to_csv(links, cols=['timestamp', 'is_archived', 'num_outputs', 'url'], header=True, ljust=16, separator=' | ')) - print('---------------------------------------------------------------------------------------------------') - print() - - -def log_removal_started(links: List["Link"], yes: bool, delete: bool): - print('{lightyellow}[i] Found {} matching URLs to remove.{reset}'.format(len(links), **ANSI)) - if delete: - file_counts = [link.num_outputs for link in links if Path(link.link_dir).exists()] - print( - f' {len(links)} Links will be de-listed from the main index, and their archived content folders will be deleted from disk.\n' - f' ({len(file_counts)} data folders with {sum(file_counts)} archived files will be deleted!)' - ) - else: - print( - ' Matching links will be de-listed from the main index, but their archived content folders will remain in place on disk.\n' - ' (Pass --delete if you also want to permanently delete the data folders)' - ) - - if not yes: - print() - print('{lightyellow}[?] 
Do you want to proceed with removing these {} links?{reset}'.format(len(links), **ANSI)) - try: - assert input(' y/[n]: ').lower() == 'y' - except (KeyboardInterrupt, EOFError, AssertionError): - raise SystemExit(0) - -def log_removal_finished(all_links: int, to_remove: int): - if all_links == 0: - print() - print('{red}[X] No matching links found.{reset}'.format(**ANSI)) - else: - print() - print('{red}[√] Removed {} out of {} links from the archive index.{reset}'.format( - to_remove, - all_links, - **ANSI, - )) - print(' Index now contains {} links.'.format(all_links - to_remove)) - - -def log_shell_welcome_msg(): - from .cli import list_subcommands - - print('{green}# ArchiveBox Imports{reset}'.format(**ANSI)) - print('{green}from core.models import Snapshot, User{reset}'.format(**ANSI)) - print('{green}from archivebox import *\n {}{reset}'.format("\n ".join(list_subcommands().keys()), **ANSI)) - print() - print('[i] Welcome to the ArchiveBox Shell!') - print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Shell-Usage') - print() - print(' {lightred}Hint:{reset} Example use:'.format(**ANSI)) - print(' print(Snapshot.objects.filter(is_archived=True).count())') - print(' Snapshot.objects.get(url="https://example.com").as_json()') - print(' add("https://example.com/some/new/url")') - - - -### Helpers - -@enforce_types -def pretty_path(path: Union[Path, str]) -> str: - """convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc""" - pwd = Path('.').resolve() - # parent = os.path.abspath(os.path.join(pwd, os.path.pardir)) - return str(path).replace(str(pwd) + '/', './') - - -@enforce_types -def printable_filesize(num_bytes: Union[int, float]) -> str: - for count in ['Bytes','KB','MB','GB']: - if num_bytes > -1024.0 and num_bytes < 1024.0: - return '%3.1f %s' % (num_bytes, count) - num_bytes /= 1024.0 - return '%3.1f %s' % (num_bytes, 'TB') - - -@enforce_types -def printable_folders(folders: Dict[str, Optional["Link"]], - with_headers: 
bool=False) -> str: - return '\n'.join( - f'{folder} {link and link.url} "{link and link.title}"' - for folder, link in folders.items() - ) - - - -@enforce_types -def printable_config(config: ConfigDict, prefix: str='') -> str: - return f'\n{prefix}'.join( - f'{key}={val}' - for key, val in config.items() - if not (isinstance(val, dict) or callable(val)) - ) - - -@enforce_types -def printable_folder_status(name: str, folder: Dict) -> str: - if folder['enabled']: - if folder['is_valid']: - color, symbol, note = 'green', '√', 'valid' - else: - color, symbol, note, num_files = 'red', 'X', 'invalid', '?' - else: - color, symbol, note, num_files = 'lightyellow', '-', 'disabled', '-' - - if folder['path']: - if Path(folder['path']).exists(): - num_files = ( - f'{len(os.listdir(folder["path"]))} files' - if Path(folder['path']).is_dir() else - printable_filesize(Path(folder['path']).stat().st_size) - ) - else: - num_files = 'missing' - - path = str(folder['path']).replace(str(OUTPUT_DIR), '.') if folder['path'] else '' - if path and ' ' in path: - path = f'"{path}"' - - # if path is just a plain dot, replace it back with the full path for clarity - if path == '.': - path = str(OUTPUT_DIR) - - return ' '.join(( - ANSI[color], - symbol, - ANSI['reset'], - name.ljust(21), - num_files.ljust(14), - ANSI[color], - note.ljust(8), - ANSI['reset'], - path.ljust(76), - )) - - -@enforce_types -def printable_dependency_version(name: str, dependency: Dict) -> str: - version = None - if dependency['enabled']: - if dependency['is_valid']: - color, symbol, note, version = 'green', '√', 'valid', '' - - parsed_version_num = re.search(r'[\d\.]+', dependency['version']) - if parsed_version_num: - version = f'v{parsed_version_num[0]}' - - if not version: - color, symbol, note, version = 'red', 'X', 'invalid', '?' 
- else: - color, symbol, note, version = 'lightyellow', '-', 'disabled', '-' - - path = str(dependency["path"]).replace(str(OUTPUT_DIR), '.') if dependency["path"] else '' - if path and ' ' in path: - path = f'"{path}"' - - return ' '.join(( - ANSI[color], - symbol, - ANSI['reset'], - name.ljust(21), - version.ljust(14), - ANSI[color], - note.ljust(8), - ANSI['reset'], - path.ljust(76), - )) diff --git a/archivebox/machine/__init__.py b/archivebox/machine/__init__.py new file mode 100644 index 0000000000..36a1de6e7a --- /dev/null +++ b/archivebox/machine/__init__.py @@ -0,0 +1 @@ +__package__ = "archivebox.machine" diff --git a/archivebox/machine/admin.py b/archivebox/machine/admin.py new file mode 100644 index 0000000000..ca0ad0b2b9 --- /dev/null +++ b/archivebox/machine/admin.py @@ -0,0 +1,659 @@ +__package__ = "archivebox.machine" + +import json +import shlex + +from django.contrib import admin, messages +from django.db.models import DurationField, ExpressionWrapper, F +from django.db.models.functions import Coalesce, Now +from django.shortcuts import redirect +from django.utils import timezone +from django.utils.html import format_html +from django_object_actions import action + +from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin +from archivebox.misc.logging_util import printable_filesize +from archivebox.machine.env_utils import env_to_dotenv_text +from archivebox.machine.models import Machine, NetworkInterface, Binary, Process + + +def _render_copy_block(text: str, *, multiline: bool = False): + if multiline: + return format_html( + """ + <div style="position: relative; width: 100%; max-width: 100%; overflow: hidden; box-sizing: border-box;"> + <button type="button" + data-command="{}" + onclick="(function(btn){{var text=btn.dataset.command||''; if(navigator.clipboard&&navigator.clipboard.writeText){{navigator.clipboard.writeText(text);}} else {{var ta=document.createElement('textarea'); ta.value=text; document.body.appendChild(ta); 
ta.select(); document.execCommand('copy'); document.body.removeChild(ta);}}}})(this); return false;" + style="position: absolute; top: 6px; right: 6px; z-index: 1; padding: 2px 8px; border: 0; border-radius: 4px; background: #e2e8f0; color: #334155; font-size: 11px; cursor: pointer;"> + Copy + </button> + <pre title="{}" style="display: block; width: 100%; max-width: 100%; overflow: auto; max-height: 300px; margin: 0; padding: 8px 56px 8px 8px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 6px; font-size: 11px; line-height: 1.45; white-space: pre-wrap; word-break: break-word; box-sizing: border-box;">{}</pre> + </div> + """, + text, + text, + text, + ) + return format_html( + """ + <div style="position: relative; width: 100%; max-width: 100%; overflow: hidden; box-sizing: border-box;"> + <button type="button" + data-command="{}" + onclick="(function(btn){{var text=btn.dataset.command||''; if(navigator.clipboard&&navigator.clipboard.writeText){{navigator.clipboard.writeText(text);}} else {{var ta=document.createElement('textarea'); ta.value=text; document.body.appendChild(ta); ta.select(); document.execCommand('copy'); document.body.removeChild(ta);}}}})(this); return false;" + style="position: absolute; top: 6px; right: 6px; z-index: 1; padding: 2px 8px; border: 0; border-radius: 4px; background: #e2e8f0; color: #334155; font-size: 11px; cursor: pointer;"> + Copy + </button> + <code title="{}" style="display: block; width: 100%; max-width: 100%; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; padding: 8px 56px 8px 8px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 6px; font-size: 11px; box-sizing: border-box;"> + {} + </code> + </div> + """, + text, + text, + text, + ) + + +def _format_process_duration_seconds(started_at, ended_at) -> str: + if not started_at: + return "-" + + end_time = ended_at or timezone.now() + seconds = max((end_time - started_at).total_seconds(), 0.0) + if seconds < 1: + return 
f"{seconds:.2f}s" + if seconds < 10 and seconds != int(seconds): + return f"{seconds:.1f}s" + return f"{int(seconds)}s" + + +class MachineAdmin(ConfigEditorMixin, BaseModelAdmin): + list_display = ( + "id", + "created_at", + "hostname", + "ips", + "os_platform", + "hw_in_docker", + "hw_in_vm", + "hw_manufacturer", + "hw_product", + "os_arch", + "os_family", + "os_release", + "hw_uuid", + "health_display", + ) + sort_fields = ( + "id", + "created_at", + "hostname", + "ips", + "os_platform", + "hw_in_docker", + "hw_in_vm", + "hw_manufacturer", + "hw_product", + "os_arch", + "os_family", + "os_release", + "hw_uuid", + ) + + readonly_fields = ("guid", "created_at", "modified_at", "ips") + + fieldsets = ( + ( + "Identity", + { + "fields": ("hostname", "guid", "ips"), + "classes": ("card",), + }, + ), + ( + "Hardware", + { + "fields": ("hw_manufacturer", "hw_product", "hw_uuid", "hw_in_docker", "hw_in_vm"), + "classes": ("card",), + }, + ), + ( + "Operating System", + { + "fields": ("os_platform", "os_family", "os_arch", "os_kernel", "os_release"), + "classes": ("card",), + }, + ), + ( + "Statistics", + { + "fields": ("stats", "num_uses_succeeded", "num_uses_failed"), + "classes": ("card",), + }, + ), + ( + "Configuration", + { + "fields": ("config",), + "classes": ("card", "wide"), + }, + ), + ( + "Timestamps", + { + "fields": ("created_at", "modified_at"), + "classes": ("card",), + }, + ), + ) + + list_filter = ("hw_in_docker", "hw_in_vm", "os_arch", "os_family", "os_platform") + ordering = ["-created_at"] + list_per_page = 100 + actions = ["delete_selected"] + + @admin.display(description="Public IP", ordering="networkinterface__ip_public") + def ips(self, machine): + return format_html( + '<a href="/admin/machine/networkinterface/?q={}"><b><code>{}</code></b></a>', + machine.id, + ", ".join(machine.networkinterface_set.values_list("ip_public", flat=True)), + ) + + @admin.display(description="Health", ordering="health") + def health_display(self, obj): + h = 
obj.health + color = "green" if h >= 80 else "orange" if h >= 50 else "red" + return format_html('<span style="color: {};">{}</span>', color, h) + + +class NetworkInterfaceAdmin(BaseModelAdmin): + list_display = ( + "id", + "created_at", + "machine_info", + "ip_public", + "dns_server", + "isp", + "country", + "region", + "city", + "iface", + "ip_local", + "mac_address", + "health_display", + ) + sort_fields = ( + "id", + "created_at", + "machine_info", + "ip_public", + "dns_server", + "isp", + "country", + "region", + "city", + "iface", + "ip_local", + "mac_address", + ) + search_fields = ( + "id", + "machine__id", + "iface", + "ip_public", + "ip_local", + "mac_address", + "dns_server", + "hostname", + "isp", + "city", + "region", + "country", + ) + + readonly_fields = ("machine", "created_at", "modified_at", "mac_address", "ip_public", "ip_local", "dns_server") + + fieldsets = ( + ( + "Machine", + { + "fields": ("machine",), + "classes": ("card",), + }, + ), + ( + "Network", + { + "fields": ("iface", "ip_public", "ip_local", "mac_address", "dns_server"), + "classes": ("card",), + }, + ), + ( + "Location", + { + "fields": ("hostname", "isp", "city", "region", "country"), + "classes": ("card",), + }, + ), + ( + "Usage", + { + "fields": ("num_uses_succeeded", "num_uses_failed"), + "classes": ("card",), + }, + ), + ( + "Timestamps", + { + "fields": ("created_at", "modified_at"), + "classes": ("card",), + }, + ), + ) + + list_filter = ("isp", "country", "region") + ordering = ["-created_at"] + list_per_page = 100 + actions = ["delete_selected"] + + @admin.display(description="Machine", ordering="machine__id") + def machine_info(self, iface): + return format_html( + '<a href="/admin/machine/machine/{}/change"><b><code>[{}]</code></b>   {}</a>', + iface.machine.id, + str(iface.machine.id)[:8], + iface.machine.hostname, + ) + + @admin.display(description="Health", ordering="health") + def health_display(self, obj): + h = obj.health + color = "green" if h >= 80 else 
"orange" if h >= 50 else "red" + return format_html('<span style="color: {};">{}</span>', color, h) + + +class BinaryAdmin(BaseModelAdmin): + list_display = ("id", "created_at", "machine_info", "name", "binprovider", "version", "abspath", "sha256", "status", "health_display") + sort_fields = ("id", "created_at", "machine_info", "name", "binprovider", "version", "abspath", "sha256", "status") + search_fields = ("id", "machine__id", "name", "binprovider", "version", "abspath", "sha256") + + readonly_fields = ("created_at", "modified_at", "output_dir") + + fieldsets = ( + ( + "Binary Info", + { + "fields": ("name", "binproviders", "binprovider", "overrides"), + "classes": ("card",), + }, + ), + ( + "Location", + { + "fields": ("machine", "abspath"), + "classes": ("card",), + }, + ), + ( + "Version", + { + "fields": ("version", "sha256"), + "classes": ("card",), + }, + ), + ( + "State", + { + "fields": ("status", "retry_at", "output_dir"), + "classes": ("card",), + }, + ), + ( + "Usage", + { + "fields": ("num_uses_succeeded", "num_uses_failed"), + "classes": ("card",), + }, + ), + ( + "Timestamps", + { + "fields": ("created_at", "modified_at"), + "classes": ("card",), + }, + ), + ) + + list_filter = ("name", "binprovider", "status", "machine_id") + ordering = ["-created_at"] + list_per_page = 100 + actions = ["delete_selected"] + + @admin.display(description="Machine", ordering="machine__id") + def machine_info(self, binary): + return format_html( + '<a href="/admin/machine/machine/{}/change"><b><code>[{}]</code></b>   {}</a>', + binary.machine.id, + str(binary.machine.id)[:8], + binary.machine.hostname, + ) + + @admin.display(description="Health", ordering="health") + def health_display(self, obj): + h = obj.health + color = "green" if h >= 80 else "orange" if h >= 50 else "red" + return format_html('<span style="color: {};">{}</span>', color, h) + + +class ProcessAdmin(BaseModelAdmin): + list_display = ( + "id", + "created_at", + "machine_info", + 
"archiveresult_link", + "snapshot_link", + "crawl_link", + "cmd_str", + "status", + "duration_display", + "exit_code", + "pid", + "output_summary", + "binary_info", + ) + sort_fields = ( + "id", + "created_at", + "machine_info", + "archiveresult_link", + "snapshot_link", + "crawl_link", + "cmd_str", + "status", + "duration_display", + "exit_code", + "pid", + "output_summary", + "binary_info", + ) + search_fields = ("id", "machine__id", "binary__name", "cmd", "pwd", "stdout", "stderr") + + readonly_fields = ( + "created_at", + "modified_at", + "machine", + "binary_link", + "iface_link", + "archiveresult_link", + "snapshot_link", + "crawl_link", + "cmd_display", + "env_display", + "timeout", + "pid", + "exit_code", + "url", + "started_at", + "ended_at", + "duration_display", + ) + + fieldsets = ( + ( + "Process Info", + { + "fields": ("machine", "archiveresult_link", "snapshot_link", "crawl_link", "status", "retry_at"), + "classes": ("card",), + }, + ), + ( + "Command", + { + "fields": ("cmd_display", "pwd", "env_display", "timeout"), + "classes": ("card", "wide"), + }, + ), + ( + "Execution", + { + "fields": ("binary_link", "iface_link", "pid", "exit_code", "url"), + "classes": ("card",), + }, + ), + ( + "Timing", + { + "fields": ("started_at", "ended_at", "duration_display"), + "classes": ("card",), + }, + ), + ( + "Output", + { + "fields": ("stdout", "stderr"), + "classes": ("card", "wide", "collapse"), + }, + ), + ( + "Timestamps", + { + "fields": ("created_at", "modified_at"), + "classes": ("card",), + }, + ), + ) + + list_filter = ("status", "exit_code", "machine_id") + ordering = ["-created_at"] + list_per_page = 100 + actions = ["kill_processes", "delete_selected"] + change_actions = ["kill_process"] + + def get_queryset(self, request): + return ( + super() + .get_queryset(request) + .select_related( + "machine", + "binary", + "iface", + "archiveresult__snapshot__crawl", + ) + .annotate( + runtime_sort=ExpressionWrapper( + Coalesce(F("ended_at"), Now()) - 
F("started_at"), + output_field=DurationField(), + ), + ) + ) + + def _terminate_processes(self, request, processes): + terminated = 0 + skipped = 0 + + for process in processes: + if process.status == Process.StatusChoices.EXITED or not process.is_running: + skipped += 1 + continue + if process.terminate(): + terminated += 1 + else: + skipped += 1 + + if terminated: + self.message_user( + request, + f"Killed {terminated} running process{'es' if terminated != 1 else ''}.", + level=messages.SUCCESS, + ) + if skipped: + self.message_user( + request, + f"Skipped {skipped} process{'es' if skipped != 1 else ''} that were already exited.", + level=messages.INFO, + ) + + return terminated, skipped + + @admin.action(description="Kill selected processes") + def kill_processes(self, request, queryset): + self._terminate_processes(request, queryset) + + @action( + label="Kill", + description="Kill this process if it is still running", + attrs={"class": "deletelink"}, + ) + def kill_process(self, request, obj): + self._terminate_processes(request, [obj]) + return redirect("admin:machine_process_change", obj.pk) + + @admin.display(description="Machine", ordering="machine__id") + def machine_info(self, process): + return format_html( + '<a href="/admin/machine/machine/{}/change"><b><code>[{}]</code></b>   {}</a>', + process.machine.id, + str(process.machine.id)[:8], + process.machine.hostname, + ) + + @admin.display(description="Binary", ordering="binary__name") + def binary_info(self, process): + if not process.binary: + return "-" + return format_html( + '<a href="/admin/machine/binary/{}/change"><code>{}</code> v{}</a>', + process.binary.id, + process.binary.name, + process.binary.version, + ) + + @admin.display(description="Binary", ordering="binary__name") + def binary_link(self, process): + return self.binary_info(process) + + @admin.display(description="Network Interface", ordering="iface__id") + def iface_link(self, process): + if not process.iface: + return "-" + return 
@admin.display(description="Command", ordering="cmd")
def cmd_str(self, process):
    """Render a short one-line preview of the process command for list views.

    List commands show their first three arguments, followed by " ..." when
    more arguments exist. Any other truthy value is rendered via ``str()``.
    The preview is capped at 80 characters and wrapped in a ``<code>`` tag.

    Returns:
        "-" when the command is empty, otherwise the escaped HTML snippet.
    """
    raw_cmd = process.cmd
    if not raw_cmd:
        return "-"

    if isinstance(raw_cmd, list):
        # JSON arrays may contain non-string items (ints, None, ...); stringify
        # each one so join() cannot raise TypeError.
        preview = " ".join(str(arg) for arg in raw_cmd[:3])
        # Only argument lists can have "more arguments"; the old code measured
        # len() of plain strings too and appended " ..." to nearly all of them.
        if len(raw_cmd) > 3:
            preview += " ..."
    else:
        preview = str(raw_cmd)

    return format_html('<code style="font-size: 0.9em;">{}</code>', preview[:80])
def get_vm_info():
    """Collect best-effort hardware identity and container/VM hints for this host.

    Every probe (``/proc/self/cgroup``, ``system_profiler``, ``dmidecode``,
    ``pmap``) is optional: if one fails, the values gathered so far are kept.

    Returns:
        dict with the keys ``hw_in_docker``, ``hw_in_vm``, ``hw_manufacturer``,
        ``hw_product`` and ``hw_uuid``.
    """
    hw_in_docker = os.getenv("IN_DOCKER", "") in ("1", "true", "True", "TRUE")
    hw_in_vm = False
    try:
        # check for traces of docker/containerd/podman in cgroup
        with open("/proc/self/cgroup") as procfile:
            for line in procfile:
                cgroup = line.strip()
                if "docker" in cgroup or "containerd" in cgroup or "podman" in cgroup:
                    hw_in_docker = True
                    break
    except Exception:
        # file is absent on non-Linux hosts; detection is best-effort
        pass

    hw_manufacturer = "Docker" if hw_in_docker else "Unknown"
    hw_product = "Container" if hw_in_docker else "Unknown"
    hw_uuid = machineid.id()

    if platform.system().lower() == "darwin":
        # Get macOS machine info
        hw_manufacturer = "Apple"
        hw_product = "Mac"
        try:
            # Hardware:
            #   Hardware Overview:
            #     Model Name: Mac Studio
            #     Model Identifier: Mac13,1
            #     Model Number: MJMV3LL/A
            #     ...
            #     Serial Number (system): M230YYTD77
            #     Hardware UUID: 39A12B50-1972-5910-8BEE-235AD20C8EE3
            #     ...
            result = subprocess.run(["system_profiler", "SPHardwareDataType"], capture_output=True, text=True, check=True)
            for line in result.stdout.split("\n"):
                if "Model Name:" in line:
                    hw_product = line.split(":", 1)[-1].strip()
                elif "Model Identifier:" in line:
                    hw_product += " " + line.split(":", 1)[-1].strip()
                elif "Hardware UUID:" in line:
                    hw_uuid = line.split(":", 1)[-1].strip()
        except Exception:
            pass
    else:
        # get Linux machine info
        try:
            # Getting SMBIOS data from sysfs.
            # SMBIOS 2.8 present.
            # Handle 0x0100, DMI type 1, 27 bytes
            # System Information
            #     Manufacturer: DigitalOcean
            #     Product Name: Droplet
            #     Serial Number: 411922099
            #     UUID: fb65f41c-ec24-4539-beaf-f941903bdb2c
            #     ...
            #     Family: DigitalOcean_Droplet
            dmidecode = subprocess.run(["dmidecode", "-t", "system"], capture_output=True, text=True, check=True)
            for line in dmidecode.stdout.split("\n"):
                if "Manufacturer:" in line:
                    hw_manufacturer = line.split(":", 1)[-1].strip()
                elif "Product Name:" in line:
                    hw_product = line.split(":", 1)[-1].strip()
                elif "UUID:" in line:
                    hw_uuid = line.split(":", 1)[-1].strip()
        except Exception:
            pass

        # Check for VM fingerprint in manufacturer/product name.
        # Both fields are inspected: a hypervisor may identify itself only in
        # the manufacturer (e.g. "QEMU") while the product name is generic.
        hw_fingerprint = f"{hw_manufacturer} {hw_product}".lower()
        if any(marker in hw_fingerprint for marker in ("qemu", "vbox", "lxc", "vm")):
            hw_in_vm = True

        # Check for QEMU explicitly in pmap output
        try:
            result = subprocess.run(["pmap", "1"], capture_output=True, text=True, check=True)
            if "qemu" in result.stdout.lower():
                hw_in_vm = True
        except Exception:
            pass

    return {
        "hw_in_docker": hw_in_docker,
        "hw_in_vm": hw_in_vm,
        "hw_manufacturer": hw_manufacturer,
        "hw_product": hw_product,
        "hw_uuid": hw_uuid,
    }
def get_isp_info(ip=None):
    """Look up ISP/geolocation details for a public IP plus the system DNS resolver.

    Args:
        ip: Public IP address to look up. When falsy, it is fetched from
            api.ipify.org first.

    Returns:
        dict with the keys ``isp``, ``city``, ``region``, ``country`` and
        ``dns_server``. Lookups are best-effort: unavailable geo fields are
        reported as "Unknown" and the DNS server falls back to "127.0.0.1".
    """
    # Get public IP (bounded by a timeout so startup cannot hang on a dead network)
    if not ip:
        try:
            with urllib.request.urlopen("https://api.ipify.org", timeout=5) as response:
                ip = response.read().decode("utf8").strip()
        except Exception:
            ip = None

    # Get ISP name, city, and country
    data = {}
    if ip:  # without an IP there is nothing meaningful to look up
        try:
            with urllib.request.urlopen(f"https://ipapi.co/{ip}/json/", timeout=5) as response:
                data = json.loads(response.read().decode())
        except Exception:
            pass
    if not isinstance(data, dict):
        data = {}

    # `or` also covers keys that are present but null/empty in the response
    isp = data.get("org") or "Unknown"
    city = data.get("city") or "Unknown"
    region = data.get("region") or "Unknown"
    country = data.get("country_name") or "Unknown"

    # Get system DNS resolver servers
    dns_server = ""
    try:
        dig_output = subprocess.run(["dig", "example.com", "A"], capture_output=True, text=True, check=True).stdout
        # e.g. ";; SERVER: 192.168.1.1#53(192.168.1.1)"
        _, marker, after_marker = dig_output.partition(";; SERVER: ")
        if marker:
            dns_server = after_marker.split("\n")[0].split("#")[0].strip()
    except Exception:
        pass

    if not dns_server:
        try:
            for line in Path("/etc/resolv.conf").read_text().splitlines():
                fields = line.split()
                # only real directives count, not commented-out "# nameserver ..." lines
                if len(fields) >= 2 and fields[0] == "nameserver":
                    dns_server = fields[1]
                    break
        except Exception:
            pass

    if not dns_server:
        dns_server = "127.0.0.1"
        print(f"[red]:warning: WARNING: Could not determine DNS server, using {dns_server}[/red]")

    return {
        "isp": isp,
        "city": city,
        "region": region,
        "country": country,
        "dns_server": dns_server,
    }
def get_os_info() -> dict[str, Any]:
    """Describe the operating system this process is running on.

    Returns:
        dict with the keys ``os_arch``, ``os_family``, ``os_platform``,
        ``os_kernel`` and ``os_release``. On macOS the release is
        "macOS <version>". Elsewhere it is the ``lsb_release -ds`` output,
        or ``platform.release()`` when that command cannot be run.
    """
    family = platform.system().lower()

    if family == "darwin":
        release = f"macOS {platform.mac_ver()[0]}"
    else:
        release = platform.release()
        try:
            lsb = subprocess.run(["lsb_release", "-ds"], capture_output=True, text=True, check=True)
            release = lsb.stdout.strip()
        except Exception:
            # lsb_release missing or failing: keep the kernel release string
            pass

    return {
        "os_arch": platform.machine(),
        "os_family": family,
        "os_platform": platform.platform(),
        "os_kernel": platform.version(),
        "os_release": release,
    }
def get_host_immutable_info(host_info: dict[str, Any]) -> dict[str, Any]:
    """Return the subset of ``host_info`` whose keys are treated as immutable.

    Only top-level entries named ``guid``, ``net_mac``, ``os_family`` or
    ``cpu_arch`` are kept, in their original order.
    """
    immutable_keys = ("guid", "net_mac", "os_family", "cpu_arch")

    selected: dict[str, Any] = {}
    for name in host_info:
        if name in immutable_keys:
            selected[name] = host_info[name]
    return selected
4.013671875, 4.1171875), +# 'mem_virt_used_pct': 66.0, +# 'mem_virt_used_gb': 15.109, +# 'mem_virt_free_gb': 0.065, +# 'mem_swap_used_pct': 89.4, +# 'mem_swap_used_gb': 8.045, +# 'mem_swap_free_gb': 0.955, +# 'disk_tmp_used_pct': 26.0, +# 'disk_tmp_used_gb': 113.1, +# 'disk_tmp_free_gb': 322.028, +# 'disk_app_used_pct': 56.1, +# 'disk_app_used_gb': 2138.796, +# 'disk_app_free_gb': 1675.996, +# 'disk_data_used_pct': 56.1, +# 'disk_data_used_gb': 2138.796, +# 'disk_data_free_gb': 1675.996 +# } +# } diff --git a/archivebox/machine/env_utils.py b/archivebox/machine/env_utils.py new file mode 100644 index 0000000000..9ab2d5f8fc --- /dev/null +++ b/archivebox/machine/env_utils.py @@ -0,0 +1,39 @@ +__package__ = "archivebox.machine" + +import json +import shlex +from typing import Any + + +SENSITIVE_ENV_KEY_PARTS = ("KEY", "TOKEN", "SECRET") + + +def stringify_env_value(value: Any) -> str: + if value is None: + return "" + if isinstance(value, str): + return value + if isinstance(value, bool): + return "True" if value else "False" + return json.dumps(value, separators=(",", ":")) + + +def is_redacted_env_key(key: str) -> bool: + upper_key = str(key or "").upper() + return any(part in upper_key for part in SENSITIVE_ENV_KEY_PARTS) + + +def redact_env(env: dict[str, Any] | None) -> dict[str, Any]: + if not isinstance(env, dict): + return {} + return {str(key): value for key, value in env.items() if key is not None and not is_redacted_env_key(str(key))} + + +def env_to_dotenv_text(env: dict[str, Any] | None) -> str: + redacted_env = redact_env(env) + return "\n".join(f"{key}={shlex.quote(stringify_env_value(value))}" for key, value in sorted(redacted_env.items()) if value is not None) + + +def env_to_shell_exports(env: dict[str, Any] | None) -> str: + redacted_env = redact_env(env) + return " ".join(f"{key}={shlex.quote(stringify_env_value(value))}" for key, value in sorted(redacted_env.items()) if value is not None) diff --git a/archivebox/machine/migrations/0001_initial.py 
b/archivebox/machine/migrations/0001_initial.py new file mode 100644 index 0000000000..2a0f018cd5 --- /dev/null +++ b/archivebox/machine/migrations/0001_initial.py @@ -0,0 +1,247 @@ +# Generated by hand on 2025-12-29 +# Creates Machine, Binary, NetworkInterface, and Process tables using raw SQL + +from django.db import migrations, models +import django.db.models.deletion +import django.utils.timezone +from archivebox.uuid_compat import uuid7 + + +class Migration(migrations.Migration): + initial = True + + dependencies = [] + + operations = [ + migrations.SeparateDatabaseAndState( + database_operations=[ + migrations.RunSQL( + sql=""" + -- Create machine_machine table + CREATE TABLE IF NOT EXISTS machine_machine ( + id TEXT PRIMARY KEY NOT NULL, + created_at DATETIME NOT NULL, + modified_at DATETIME NOT NULL, + num_uses_succeeded INTEGER NOT NULL DEFAULT 0, + num_uses_failed INTEGER NOT NULL DEFAULT 0, + + guid VARCHAR(64) NOT NULL UNIQUE, + hostname VARCHAR(63) NOT NULL, + hw_in_docker BOOLEAN NOT NULL DEFAULT 0, + hw_in_vm BOOLEAN NOT NULL DEFAULT 0, + hw_manufacturer VARCHAR(63) NOT NULL, + hw_product VARCHAR(63) NOT NULL, + hw_uuid VARCHAR(255) NOT NULL, + + os_arch VARCHAR(15) NOT NULL, + os_family VARCHAR(15) NOT NULL, + os_platform VARCHAR(63) NOT NULL, + os_release VARCHAR(63) NOT NULL, + os_kernel VARCHAR(255) NOT NULL, + + stats TEXT, + config TEXT + ); + CREATE INDEX IF NOT EXISTS machine_machine_guid_idx ON machine_machine(guid); + + -- Create machine_networkinterface table + CREATE TABLE IF NOT EXISTS machine_networkinterface ( + id TEXT PRIMARY KEY NOT NULL, + created_at DATETIME NOT NULL, + modified_at DATETIME NOT NULL, + num_uses_succeeded INTEGER NOT NULL DEFAULT 0, + num_uses_failed INTEGER NOT NULL DEFAULT 0, + + machine_id TEXT NOT NULL, + iface VARCHAR(15) NOT NULL, + ip_public VARCHAR(39) NOT NULL, + ip_local VARCHAR(39) NOT NULL, + mac_address VARCHAR(17) NOT NULL, + dns_server VARCHAR(39) NOT NULL, + hostname VARCHAR(256) NOT NULL, + isp 
VARCHAR(256) NOT NULL, + city VARCHAR(100) NOT NULL, + region VARCHAR(100) NOT NULL, + country VARCHAR(100) NOT NULL, + + FOREIGN KEY (machine_id) REFERENCES machine_machine(id) ON DELETE CASCADE + ); + CREATE INDEX IF NOT EXISTS machine_networkinterface_machine_id_idx ON machine_networkinterface(machine_id); + + -- Create machine_binary table + CREATE TABLE IF NOT EXISTS machine_binary ( + id TEXT PRIMARY KEY NOT NULL, + created_at DATETIME NOT NULL, + modified_at DATETIME NOT NULL, + num_uses_succeeded INTEGER NOT NULL DEFAULT 0, + num_uses_failed INTEGER NOT NULL DEFAULT 0, + + machine_id TEXT NOT NULL, + name VARCHAR(63) NOT NULL, + binproviders VARCHAR(127) NOT NULL DEFAULT 'env', + overrides TEXT NOT NULL DEFAULT '{}', + + binprovider VARCHAR(31) NOT NULL DEFAULT '', + abspath VARCHAR(255) NOT NULL DEFAULT '', + version VARCHAR(32) NOT NULL DEFAULT '', + sha256 VARCHAR(64) NOT NULL DEFAULT '', + + status VARCHAR(16) NOT NULL DEFAULT 'queued', + retry_at DATETIME, + output_dir VARCHAR(255) NOT NULL DEFAULT '', + + FOREIGN KEY (machine_id) REFERENCES machine_machine(id) ON DELETE CASCADE, + UNIQUE(machine_id, name, abspath, version, sha256) + ); + CREATE INDEX IF NOT EXISTS machine_binary_machine_id_idx ON machine_binary(machine_id); + CREATE INDEX IF NOT EXISTS machine_binary_name_idx ON machine_binary(name); + CREATE INDEX IF NOT EXISTS machine_binary_status_idx ON machine_binary(status); + CREATE INDEX IF NOT EXISTS machine_binary_retry_at_idx ON machine_binary(retry_at); + + """, + reverse_sql=""" + DROP TABLE IF EXISTS machine_binary; + DROP TABLE IF EXISTS machine_networkinterface; + DROP TABLE IF EXISTS machine_machine; + """, + ), + ], + state_operations=[ + migrations.CreateModel( + name="Machine", + fields=[ + ("id", models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)), + ("created_at", models.DateTimeField(db_index=True, default=django.utils.timezone.now)), + ("modified_at", 
models.DateTimeField(auto_now=True)), + ("guid", models.CharField(default=None, editable=False, max_length=64, unique=True)), + ("hostname", models.CharField(default=None, max_length=63)), + ("hw_in_docker", models.BooleanField(default=False)), + ("hw_in_vm", models.BooleanField(default=False)), + ("hw_manufacturer", models.CharField(default=None, max_length=63)), + ("hw_product", models.CharField(default=None, max_length=63)), + ("hw_uuid", models.CharField(default=None, max_length=255)), + ("os_arch", models.CharField(default=None, max_length=15)), + ("os_family", models.CharField(default=None, max_length=15)), + ("os_platform", models.CharField(default=None, max_length=63)), + ("os_release", models.CharField(default=None, max_length=63)), + ("os_kernel", models.CharField(default=None, max_length=255)), + ("stats", models.JSONField(blank=True, default=dict, null=True)), + ( + "config", + models.JSONField( + blank=True, + default=dict, + help_text="Machine-specific config overrides (e.g., resolved binary paths like WGET_BINARY)", + null=True, + ), + ), + ("num_uses_succeeded", models.PositiveIntegerField(default=0)), + ("num_uses_failed", models.PositiveIntegerField(default=0)), + ], + options={ + "app_label": "machine", + }, + ), + migrations.CreateModel( + name="NetworkInterface", + fields=[ + ("id", models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)), + ("created_at", models.DateTimeField(db_index=True, default=django.utils.timezone.now)), + ("modified_at", models.DateTimeField(auto_now=True)), + ("mac_address", models.CharField(default=None, editable=False, max_length=17)), + ("ip_public", models.GenericIPAddressField(default=None, editable=False)), + ("ip_local", models.GenericIPAddressField(default=None, editable=False)), + ("dns_server", models.GenericIPAddressField(default=None, editable=False)), + ("hostname", models.CharField(default=None, max_length=63)), + ("iface", models.CharField(default=None, 
max_length=15)), + ("isp", models.CharField(default=None, max_length=63)), + ("city", models.CharField(default=None, max_length=63)), + ("region", models.CharField(default=None, max_length=63)), + ("country", models.CharField(default=None, max_length=63)), + ("machine", models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, to="machine.machine")), + ("num_uses_succeeded", models.PositiveIntegerField(default=0)), + ("num_uses_failed", models.PositiveIntegerField(default=0)), + ], + options={ + "unique_together": {("machine", "ip_public", "ip_local", "mac_address", "dns_server")}, + "app_label": "machine", + }, + ), + migrations.CreateModel( + name="Binary", + fields=[ + ("id", models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)), + ("created_at", models.DateTimeField(db_index=True, default=django.utils.timezone.now)), + ("modified_at", models.DateTimeField(auto_now=True)), + ("name", models.CharField(blank=True, db_index=True, default="", max_length=63)), + ( + "binproviders", + models.CharField( + blank=True, + default="env", + help_text="Comma-separated list of allowed providers: apt,brew,pip,npm,env", + max_length=127, + ), + ), + ( + "overrides", + models.JSONField( + blank=True, + default=dict, + help_text="Provider-specific overrides: {'apt': {'install_args': ['pkg']}, ...}", + ), + ), + ( + "binprovider", + models.CharField( + blank=True, + default="", + help_text="Provider that successfully installed this binary", + max_length=31, + ), + ), + ("abspath", models.CharField(blank=True, default="", max_length=255)), + ("version", models.CharField(blank=True, default="", max_length=32)), + ("sha256", models.CharField(blank=True, default="", max_length=64)), + ( + "status", + models.CharField( + choices=[("queued", "Queued"), ("started", "Started"), ("succeeded", "Succeeded"), ("failed", "Failed")], + db_index=True, + default="queued", + max_length=16, + ), + ), + ( + "retry_at", + 
def converge_binary_table(apps, schema_editor):
    """
    Drop machine_installedbinary if it exists (0.8.6rc0 path).
    Create machine_binary if it doesn't exist (needed by Process model).

    The SQL runs on ``schema_editor.connection`` so it always targets the
    database being migrated, and the cursor is closed when the work is done.
    ``apps`` is unused. The table lookup relies on SQLite's ``sqlite_master``.
    """
    with schema_editor.connection.cursor() as cursor:
        # Check what tables exist
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name IN ('machine_installedbinary', 'machine_binary')")
        existing_tables = {row[0] for row in cursor.fetchall()}

        print(f"DEBUG 0005: Existing tables: {existing_tables}")

        # Drop old Binary table if it exists (0.8.6rc0 path)
        if "machine_installedbinary" in existing_tables:
            print("✓ Dropping machine_installedbinary table (0.8.6rc0 divergence)")
            cursor.execute("DROP TABLE IF EXISTS machine_installedbinary")

        # Create Binary table if it doesn't exist
        # This handles the case where 0.8.6rc0's 0001_initial didn't create it
        if "machine_binary" in existing_tables:
            print("✓ machine_binary table already exists")
            return

        print("✓ Creating machine_binary table with correct schema")
        cursor.execute("""
            CREATE TABLE machine_binary (
                id TEXT PRIMARY KEY NOT NULL,
                created_at DATETIME NOT NULL,
                modified_at DATETIME NOT NULL,
                num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
                num_uses_failed INTEGER NOT NULL DEFAULT 0,
                machine_id TEXT NOT NULL REFERENCES machine_machine(id) ON DELETE CASCADE,
                name VARCHAR(63) NOT NULL,
                binproviders VARCHAR(255) NOT NULL DEFAULT 'env',
                overrides TEXT NOT NULL DEFAULT '{}',
                binprovider VARCHAR(63) NOT NULL DEFAULT 'env',
                abspath VARCHAR(255) NOT NULL,
                version VARCHAR(128) NOT NULL,
                sha256 VARCHAR(64) NOT NULL DEFAULT '',
                status VARCHAR(16) NOT NULL DEFAULT 'succeeded',
                retry_at DATETIME NULL,
                output_dir VARCHAR(255) NOT NULL DEFAULT ''
            )
        """)

        # Create indexes
        cursor.execute("CREATE INDEX machine_binary_machine_id_idx ON machine_binary(machine_id)")
        cursor.execute("CREATE INDEX machine_binary_name_idx ON machine_binary(name)")
        cursor.execute("CREATE INDEX machine_binary_abspath_idx ON machine_binary(abspath)")

        print("✓ machine_binary table created")
"0001_initial"), + ] + + operations = [ + migrations.RunPython( + converge_binary_table, + reverse_code=migrations.RunPython.noop, + ), + ] diff --git a/archivebox/machine/migrations/0006_process.py b/archivebox/machine/migrations/0006_process.py new file mode 100644 index 0000000000..eb23475661 --- /dev/null +++ b/archivebox/machine/migrations/0006_process.py @@ -0,0 +1,102 @@ +# Generated by Django 6.0 on 2025-12-31 22:54 + +import django.db.models.deletion +import django.utils.timezone +from django.db import migrations, models + +from archivebox.uuid_compat import uuid7 + + +class Migration(migrations.Migration): + dependencies = [ + ("machine", "0005_converge_binary_model"), + ] + + operations = [ + migrations.CreateModel( + name="Process", + fields=[ + ("id", models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)), + ("created_at", models.DateTimeField(db_index=True, default=django.utils.timezone.now)), + ("modified_at", models.DateTimeField(auto_now=True)), + ("pwd", models.CharField(blank=True, default="", help_text="Working directory for process execution", max_length=512)), + ("cmd", models.JSONField(blank=True, default=list, help_text="Command as array of arguments")), + ("env", models.JSONField(blank=True, default=dict, help_text="Environment variables for process")), + ("timeout", models.IntegerField(default=120, help_text="Timeout in seconds")), + ("pid", models.IntegerField(blank=True, default=None, help_text="OS process ID", null=True)), + ("exit_code", models.IntegerField(blank=True, default=None, help_text="Process exit code (0 = success)", null=True)), + ("stdout", models.TextField(blank=True, default="", help_text="Standard output from process")), + ("stderr", models.TextField(blank=True, default="", help_text="Standard error from process")), + ("started_at", models.DateTimeField(blank=True, default=None, help_text="When process was launched", null=True)), + ("ended_at", models.DateTimeField(blank=True, 
default=None, help_text="When process completed/terminated", null=True)), + ( + "url", + models.URLField( + blank=True, + default=None, + help_text="Connection URL (CDP endpoint, sonic server, etc.)", + max_length=2048, + null=True, + ), + ), + ( + "status", + models.CharField( + choices=[("queued", "Queued"), ("running", "Running"), ("exited", "Exited")], + db_index=True, + default="queued", + max_length=16, + ), + ), + ( + "retry_at", + models.DateTimeField( + blank=True, + db_index=True, + default=django.utils.timezone.now, + help_text="When to retry this process", + null=True, + ), + ), + ( + "binary", + models.ForeignKey( + blank=True, + help_text="Binary used by this process", + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="process_set", + to="machine.binary", + ), + ), + ( + "iface", + models.ForeignKey( + blank=True, + help_text="Network interface used by this process", + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="process_set", + to="machine.networkinterface", + ), + ), + ( + "machine", + models.ForeignKey( + help_text="Machine where this process executed", + on_delete=django.db.models.deletion.CASCADE, + related_name="process_set", + to="machine.machine", + ), + ), + ], + options={ + "verbose_name": "Process", + "verbose_name_plural": "Processes", + "indexes": [ + models.Index(fields=["machine", "status", "retry_at"], name="machine_pro_machine_5e3a87_idx"), + models.Index(fields=["binary", "exit_code"], name="machine_pro_binary__7bd19c_idx"), + ], + }, + ), + ] diff --git a/archivebox/machine/migrations/0007_add_process_type_and_parent.py b/archivebox/machine/migrations/0007_add_process_type_and_parent.py new file mode 100644 index 0000000000..9d81a773a6 --- /dev/null +++ b/archivebox/machine/migrations/0007_add_process_type_and_parent.py @@ -0,0 +1,42 @@ +# Generated by Django 6.0 on 2026-01-01 22:55 + +import django.db.models.deletion +from django.db import migrations, models + + +class 
class Migration(migrations.Migration):
    """Add the ``worker_type`` column to the ``Process`` model."""

    # Must run after the migration that introduced process_type/parent.
    dependencies = [
        ("machine", "0007_add_process_type_and_parent"),
    ]

    operations = [
        # New indexed, optional text column; existing rows receive the empty-string default.
        migrations.AddField(
            model_name="process",
            name="worker_type",
            field=models.CharField(
                blank=True,
                db_index=True,
                default="",
                help_text="Worker type name for WORKER processes (crawl, snapshot, archiveresult)",
                max_length=32,
            ),
        ),
    ]
def remove_output_dir_if_exists(apps, schema_editor):
    """
    Drop the ``machine_binary.output_dir`` column, but only if it is present.

    Args:
        apps: Historical app registry passed in by ``migrations.RunPython``.
        schema_editor: Schema editor for the database being migrated.

    Returns:
        None. The function is a no-op when the column does not exist.
    """
    # Use the cursor as a context manager so it is always closed, even if the
    # PRAGMA query raises, and before the schema editor starts altering the table.
    with schema_editor.connection.cursor() as cursor:
        # SQLite introspection: the column name is the second item of each row.
        cursor.execute("PRAGMA table_info(machine_binary)")
        columns = {row[1] for row in cursor.fetchall()}

    if "output_dir" not in columns:
        return

    Binary = apps.get_model("machine", "Binary")
    schema_editor.remove_field(Binary, Binary._meta.get_field("output_dir"))
"0010_alter_process_process_type"), + ] + + operations = [ + migrations.SeparateDatabaseAndState( + database_operations=[ + migrations.RunPython(remove_output_dir_if_exists, migrations.RunPython.noop), + ], + state_operations=[ + migrations.RemoveField( + model_name="binary", + name="output_dir", + ), + ], + ), + ] diff --git a/archivebox/machine/migrations/__init__.py b/archivebox/machine/migrations/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/archivebox/machine/models.py b/archivebox/machine/models.py new file mode 100755 index 0000000000..10444b36af --- /dev/null +++ b/archivebox/machine/models.py @@ -0,0 +1,2570 @@ +from __future__ import annotations + +__package__ = "archivebox.machine" + +import os +import sys +import uuid +import socket +from pathlib import Path +from archivebox.uuid_compat import uuid7 +from datetime import timedelta, datetime +from typing import TYPE_CHECKING, Any, cast + +from statemachine import State, registry + +from django.db import models +from django.db.models import QuerySet +from django.utils import timezone +from django.utils.functional import cached_property +from django_stubs_ext.db.models import TypedModelMeta + +from archivebox.base_models.models import ModelWithHealthStats +from archivebox.workers.models import BaseStateMachine, ModelWithStateMachine +from .detect import get_host_guid, get_os_info, get_vm_info, get_host_network, get_host_stats + +_psutil: Any | None = None +try: + import psutil as _psutil_import + + PSUTIL_AVAILABLE = True +except ImportError: + PSUTIL_AVAILABLE = False +else: + _psutil = _psutil_import + +if TYPE_CHECKING: + import psutil + from archivebox.core.models import ArchiveResult +else: + psutil = cast(Any, _psutil) + +_CURRENT_MACHINE: Machine | None = None +_CURRENT_INTERFACE: NetworkInterface | None = None +_CURRENT_BINARIES: dict[str, Binary] = {} +_CURRENT_PROCESS: Process | None = None + +MACHINE_RECHECK_INTERVAL = 7 * 24 * 60 * 60 
+NETWORK_INTERFACE_RECHECK_INTERVAL = 1 * 60 * 60 +BINARY_RECHECK_INTERVAL = 1 * 30 * 60 +PROCESS_RECHECK_INTERVAL = 60 # Re-validate every 60 seconds +PID_REUSE_WINDOW = timedelta(hours=24) # Max age for considering a PID match valid +PROCESS_TIMEOUT_GRACE = timedelta(seconds=30) # Extra margin before force-cleaning timed-out RUNNING rows +START_TIME_TOLERANCE = 5.0 # Seconds tolerance for start time matching +LEGACY_MACHINE_CONFIG_KEYS = frozenset({"CHROMIUM_VERSION"}) + + +def _find_existing_binary_for_reference(machine: Machine, reference: str) -> Binary | None: + reference = str(reference or "").strip() + if not reference: + return None + + qs = Binary.objects.filter(machine=machine) + + direct_match = qs.filter(abspath=reference).order_by("-modified_at").first() + if direct_match: + return direct_match + + ref_name = Path(reference).name + if ref_name: + named_match = qs.filter(name=ref_name).order_by("-modified_at").first() + if named_match: + return named_match + + return qs.filter(name=reference).order_by("-modified_at").first() + + +def _get_process_binary_env_keys(plugin_name: str, hook_path: str, env: dict[str, Any] | None) -> list[str]: + env = env or {} + plugin_name = str(plugin_name or "").strip() + hook_path = str(hook_path or "").strip() + plugin_key = plugin_name.upper().replace("-", "_") + keys: list[str] = [] + seen: set[str] = set() + + def add(key: str) -> None: + if key and key not in seen and env.get(key): + seen.add(key) + keys.append(key) + + if plugin_key: + add(f"{plugin_key}_BINARY") + + try: + from archivebox.hooks import discover_plugin_configs + + plugin_schema = discover_plugin_configs().get(plugin_name, {}) + schema_keys = [key for key in (plugin_schema.get("properties") or {}) if key.endswith("_BINARY")] + except Exception: + schema_keys = [] + + schema_keys.sort( + key=lambda key: ( + key != f"{plugin_key}_BINARY", + key, + ), + ) + for key in schema_keys: + add(key) + + if plugin_name.startswith("search_backend_"): + 
backend_name = plugin_name.removeprefix("search_backend_").upper().replace("-", "_") + configured_engine = str(env.get("SEARCH_BACKEND_ENGINE") or "").strip().upper().replace("-", "_") + if backend_name and backend_name == configured_engine: + add(f"{backend_name}_BINARY") + + hook_suffix = Path(hook_path).suffix.lower() + if hook_suffix == ".js": + add("NODE_BINARY") + + return keys + + +def _sanitize_machine_config(config: dict[str, Any] | None) -> dict[str, Any]: + if not isinstance(config, dict): + return {} + + sanitized = dict(config) + for key in LEGACY_MACHINE_CONFIG_KEYS: + sanitized.pop(key, None) + return sanitized + + +class MachineManager(models.Manager): + def current(self) -> Machine: + return Machine.current() + + +class Machine(ModelWithHealthStats): + id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True) + created_at = models.DateTimeField(default=timezone.now, db_index=True) + modified_at = models.DateTimeField(auto_now=True) + guid = models.CharField(max_length=64, default=None, null=False, unique=True, editable=False) + hostname = models.CharField(max_length=63, default=None, null=False) + hw_in_docker = models.BooleanField(default=False, null=False) + hw_in_vm = models.BooleanField(default=False, null=False) + hw_manufacturer = models.CharField(max_length=63, default=None, null=False) + hw_product = models.CharField(max_length=63, default=None, null=False) + hw_uuid = models.CharField(max_length=255, default=None, null=False) + os_arch = models.CharField(max_length=15, default=None, null=False) + os_family = models.CharField(max_length=15, default=None, null=False) + os_platform = models.CharField(max_length=63, default=None, null=False) + os_release = models.CharField(max_length=63, default=None, null=False) + os_kernel = models.CharField(max_length=255, default=None, null=False) + stats = models.JSONField(default=dict, null=True, blank=True) + config = models.JSONField( + default=dict, + null=True, + blank=True, 
@classmethod
def current(cls) -> Machine:
    """
    Return the Machine row for this host, using a module-level cache.

    The cached instance is reused while it is younger than
    MACHINE_RECHECK_INTERVAL (measured from its ``modified_at``) and its row
    still exists. Otherwise host details are re-detected and upserted by GUID.
    The returned machine always has legacy config keys stripped.
    """
    global _CURRENT_MACHINE

    cached = _CURRENT_MACHINE
    if cached:
        still_fresh = timezone.now() < cached.modified_at + timedelta(seconds=MACHINE_RECHECK_INTERVAL)
        # The existence query only runs for a fresh cache entry.
        if still_fresh and cls.objects.filter(id=cached.id).exists():
            return cls._sanitize_config(cached)
        # Stale, or the row was deleted: clear the cache before re-detecting.
        _CURRENT_MACHINE = None

    host_guid = get_host_guid()
    detected = {"hostname": socket.gethostname(), **get_os_info(), **get_vm_info(), "stats": get_host_stats()}
    machine, _created = cls.objects.update_or_create(guid=host_guid, defaults=detected)
    _CURRENT_MACHINE = machine
    return cls._sanitize_config(machine)
@staticmethod
def from_json(record: dict[str, Any], overrides: dict[str, Any] | None = None):
    """
    Apply a config patch from a JSON record to the current Machine.

    Args:
        record: JSON dict; only its ``"config"`` entry (a ``{key: value}``
            patch) is read. Legacy keys are stripped before it is applied.
        overrides: Not used.

    Returns:
        The updated Machine, or None when the record carries no usable
        config patch.
    """
    config_patch = _sanitize_machine_config(record.get("config"))
    if not config_patch:
        return None

    machine = Machine.current()
    machine.config = _sanitize_machine_config(machine.config)
    machine.config.update(config_patch)
    # auto_now fields are only written when named in update_fields. List
    # modified_at explicitly, as _sanitize_config() does, so the timestamp
    # reflects this change.
    machine.save(update_fields=["config", "modified_at"])
    return machine
@classmethod
def current(cls, refresh: bool = False) -> NetworkInterface:
    """
    Return the NetworkInterface row for this host's current network.

    Args:
        refresh: When True, skip the module-level cache and re-detect.

    Returns:
        The cached interface if it belongs to the current machine and is
        younger than NETWORK_INTERFACE_RECHECK_INTERVAL; otherwise a row
        upserted from freshly detected network info.
    """
    global _CURRENT_INTERFACE

    machine = Machine.current()
    cached = _CURRENT_INTERFACE
    if cached:
        reusable = (
            not refresh
            and cached.machine_id == machine.id
            and timezone.now() < cached.modified_at + timedelta(seconds=NETWORK_INTERFACE_RECHECK_INTERVAL)
        )
        if reusable:
            return cached
        _CURRENT_INTERFACE = None

    net_info = get_host_network()
    # These fields identify the row; whatever is left in net_info becomes
    # the update defaults.
    identity = {
        field: net_info.pop(field)
        for field in ("ip_public", "ip_local", "mac_address", "dns_server")
    }
    interface, _created = cls.objects.update_or_create(machine=machine, defaults=net_info, **identity)
    _CURRENT_INTERFACE = interface
    return interface
def get_valid_binary(self, name: str, machine: Machine | None = None) -> Binary | None:
    """
    Find the most recently modified Binary with a resolved path.

    Args:
        name: Binary name, matched case-insensitively.
        machine: Machine to search; defaults to the current machine.

    Returns:
        The newest matching Binary whose ``abspath`` is neither empty nor
        NULL, or None if there is none.
    """
    if not machine:
        machine = Machine.current()

    by_name = self.filter(machine=machine, name__iexact=name)
    with_path = by_name.exclude(abspath="").exclude(abspath__isnull=True)
    return with_path.order_by("-modified_at").first()
+ """ + + class StatusChoices(models.TextChoices): + QUEUED = "queued", "Queued" + INSTALLED = "installed", "Installed" + + id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True) + created_at = models.DateTimeField(default=timezone.now, db_index=True) + modified_at = models.DateTimeField(auto_now=True) + machine = models.ForeignKey(Machine, on_delete=models.CASCADE, null=False) + + # Binary metadata + name = models.CharField(max_length=63, default="", null=False, blank=True, db_index=True) + binproviders = models.CharField( + max_length=127, + default="env", + null=False, + blank=True, + help_text="Comma-separated list of allowed providers: apt,brew,pip,npm,env", + ) + overrides = models.JSONField( + default=dict, + blank=True, + help_text="Provider-specific overrides: {'apt': {'install_args': ['pkg']}, ...}", + ) + + # Installation results (populated after installation) + binprovider = models.CharField( + max_length=31, + default="", + null=False, + blank=True, + help_text="Provider that successfully installed this binary", + ) + abspath = models.CharField(max_length=255, default="", null=False, blank=True) + version = models.CharField(max_length=32, default="", null=False, blank=True) + sha256 = models.CharField(max_length=64, default="", null=False, blank=True) + + # State machine fields + status = ModelWithStateMachine.StatusField(choices=StatusChoices.choices, default=StatusChoices.QUEUED, max_length=16) + retry_at = ModelWithStateMachine.RetryAtField( + default=timezone.now, + help_text="When to retry this binary installation", + ) + + # Health stats + num_uses_failed = models.PositiveIntegerField(default=0) + num_uses_succeeded = models.PositiveIntegerField(default=0) + + machine_id: uuid.UUID + + state_machine_name: str | None = "archivebox.machine.models.BinaryMachine" + active_state: str = StatusChoices.QUEUED + + objects = BinaryManager() # pyright: ignore[reportIncompatibleVariableOverride] + + class 
@property
def binary_info(self) -> dict:
    """
    Return a summary dict describing this binary.

    This is a plain property rather than a cached one: the fields it reads
    (``abspath``, ``version``, ``binprovider``, ``is_valid``) change on the
    same instance once installation succeeds, so a cached value would keep
    reporting the pre-install state.

    Returns:
        Dict with keys ``name``, ``abspath``, ``version``, ``binprovider``
        and ``is_valid``.
    """
    return {
        "name": self.name,
        "abspath": self.abspath,
        "version": self.version,
        "binprovider": self.binprovider,
        "is_valid": self.is_valid,
    }
From hook output: updates binary with abspath, version, sha256, binprovider + + Args: + record: JSON dict with 'name' and either: + - 'binproviders', 'overrides' (from binaries.json) + - 'abspath', 'version', 'sha256', 'binprovider' (from hook output) + overrides: Not used + + Returns: + Binary instance or None + """ + name = record.get("name") + if not name: + return None + + machine = Machine.current() + overrides = overrides or {} + binary_overrides = record.get("overrides", {}) + normalized_overrides = binary_overrides if isinstance(binary_overrides, dict) else {} + + # abx-plugins currently emits a GitHub install URL for readability-extractor, + # but the package is published on npm. Prefer the registry package to avoid + # long git-based installs in CI while still using canonical install_args. + if ( + name == "readability-extractor" + and isinstance(normalized_overrides.get("npm"), dict) + and normalized_overrides["npm"].get("install_args") == ["https://github.com/ArchiveBox/readability-extractor"] + ): + normalized_overrides = { + **normalized_overrides, + "npm": { + **normalized_overrides["npm"], + "install_args": ["readability-extractor"], + }, + } + + # Case 1: Already installed (from on_Crawl hooks) - has abspath AND binproviders + # This happens when on_Crawl hooks detect already-installed binaries + abspath = record.get("abspath") + version = record.get("version") + binproviders = record.get("binproviders") + + if abspath and version and binproviders: + # Binary is already installed, create INSTALLED record with binproviders filter + binary, _ = Binary.objects.update_or_create( + machine=machine, + name=name, + defaults={ + "abspath": abspath, + "version": version, + "sha256": record.get("sha256", ""), + "binprovider": record.get("binprovider", "env"), + "binproviders": binproviders, # Preserve the filter + "status": Binary.StatusChoices.INSTALLED, + "retry_at": None, + }, + ) + return binary + + # Case 2: From binaries.json - create queued binary 
(needs installation) + if "binproviders" in record or ("overrides" in record and not abspath): + binary, _ = Binary.objects.update_or_create( + machine=machine, + name=name, + defaults={ + "binproviders": record.get("binproviders", "env"), + "overrides": normalized_overrides, + "status": Binary.StatusChoices.QUEUED, + "retry_at": timezone.now(), + }, + ) + return binary + + # Case 3: From on_BinaryRequest__ hook output - update with installation results + if abspath and version: + binary, _ = Binary.objects.update_or_create( + machine=machine, + name=name, + defaults={ + "abspath": abspath, + "version": version, + "sha256": record.get("sha256", ""), + "binprovider": record.get("binprovider", "env"), + "status": Binary.StatusChoices.INSTALLED, + "retry_at": None, + }, + ) + return binary + + return None + + def update_and_requeue(self, **kwargs) -> bool: + """ + Update binary fields and requeue for worker state machine. + + Sets modified_at to ensure workers pick up changes. + Always saves the model after updating. + """ + for key, value in kwargs.items(): + setattr(self, key, value) + self.modified_at = timezone.now() + self.save() + return True + + def _allowed_binproviders(self) -> set[str] | None: + """Return the allowed binproviders for this binary, or None for wildcard.""" + providers = str(self.binproviders or "").strip() + if not providers or providers == "*": + return None + return {provider.strip() for provider in providers.split(",") if provider.strip()} + + def run(self): + """ + Execute binary installation by running on_BinaryRequest__* hooks. + + Called by BinaryMachine when entering 'started' state. + Runs ALL on_BinaryRequest__* hooks - each hook checks binproviders + and decides if it can handle this binary. First hook to succeed wins. + Updates status to SUCCEEDED or FAILED based on hook output. 
+ """ + import json + from archivebox.hooks import discover_hooks, run_hook + from archivebox.config.configset import get_config + + # Get merged config (Binary doesn't have crawl/snapshot context). + config = get_config() + + # ArchiveBox installs the puppeteer package and Chromium in separate + # hook phases. Suppress puppeteer's bundled browser download during the + # package install step so the dedicated chromium hook owns that work. + if self.name == "puppeteer": + config.setdefault("PUPPETEER_SKIP_DOWNLOAD", "true") + config.setdefault("PUPPETEER_SKIP_CHROMIUM_DOWNLOAD", "true") + + # Create output directory + output_dir = self.output_dir + output_dir.mkdir(parents=True, exist_ok=True) + + # Discover ALL on_BinaryRequest__* hooks + hooks = discover_hooks("BinaryRequest", config=config) + if not hooks: + # No hooks available - stay queued, will retry later + return + + allowed_binproviders = self._allowed_binproviders() + + # Run each hook - they decide if they can handle this binary + for hook in hooks: + plugin_name = hook.parent.name + if allowed_binproviders is not None and plugin_name not in allowed_binproviders: + continue + + plugin_output_dir = output_dir / plugin_name + plugin_output_dir.mkdir(parents=True, exist_ok=True) + + overrides_json = None + if self.overrides: + overrides_json = json.dumps(self.overrides) + + # Run the hook + process = run_hook( + hook, + output_dir=plugin_output_dir, + config=config, + timeout=600, # 10 min timeout for binary installation + binary_id=str(self.id), + machine_id=str(self.machine_id), + name=self.name, + binproviders=self.binproviders, + overrides=overrides_json, + ) + + # Background hook (unlikely for binary installation, but handle it) + if process is None: + continue + + # Failed or skipped hook - try next one + if process.exit_code != 0: + continue + + # Parse JSONL output to check for successful installation + from archivebox.hooks import extract_records_from_process, process_hook_records + + records = 
extract_records_from_process(process) + if records: + process_hook_records(records, overrides={}) + binary_records = [record for record in records if record.get("type") == "Binary" and record.get("abspath")] + if binary_records: + record = binary_records[0] + # Update self from successful installation + self.abspath = record["abspath"] + self.version = record.get("version", "") + self.sha256 = record.get("sha256", "") + self.binprovider = record.get("binprovider", "env") + self.status = self.StatusChoices.INSTALLED + self.save() + + # Symlink binary into LIB_BIN_DIR if configured + from django.conf import settings + + lib_bin_dir = getattr(settings, "LIB_BIN_DIR", None) + if lib_bin_dir: + self.symlink_to_lib_bin(lib_bin_dir) + + return + + # No hook succeeded - leave status as QUEUED (will retry later) + # Don't set to FAILED since we don't have that status anymore + + def cleanup(self): + """ + Clean up background binary installation hooks. + + Called by state machine if needed (not typically used for binaries + since installations are foreground, but included for consistency). + """ + + # Clean up .pid files from output directory + output_dir = self.output_dir + if output_dir.exists(): + for pid_file in output_dir.glob("**/*.pid"): + pid_file.unlink(missing_ok=True) + + def symlink_to_lib_bin(self, lib_bin_dir: str | Path) -> Path | None: + """ + Symlink this binary into LIB_BIN_DIR for unified PATH management. + + After a binary is installed by any binprovider (pip, npm, brew, apt, etc), + we symlink it into LIB_BIN_DIR so that: + 1. All binaries can be found in a single directory + 2. PATH only needs LIB_BIN_DIR prepended (not multiple provider-specific paths) + 3. 
Binary priorities are clear (symlink points to the canonical install location) + + Args: + lib_bin_dir: Path to LIB_BIN_DIR (e.g., /data/lib/arm64-darwin/bin) + + Returns: + Path to the created symlink, or None if symlinking failed + + Example: + >>> binary = Binary.objects.get(name='yt-dlp') + >>> binary.symlink_to_lib_bin('/data/lib/arm64-darwin/bin') + Path('/data/lib/arm64-darwin/bin/yt-dlp') + """ + import sys + from pathlib import Path + + if not self.abspath: + return None + + binary_abspath = Path(self.abspath).resolve() + lib_bin_dir = Path(lib_bin_dir).resolve() + binary_parts = binary_abspath.parts + try: + app_index = next(index for index, part in enumerate(binary_parts) if part.endswith(".app")) + except StopIteration: + app_index = -1 + + # Create LIB_BIN_DIR if it doesn't exist + try: + lib_bin_dir.mkdir(parents=True, exist_ok=True) + except (OSError, PermissionError) as e: + print(f"Failed to create LIB_BIN_DIR {lib_bin_dir}: {e}", file=sys.stderr) + return None + + # Get binary name (last component of path) + binary_name = binary_abspath.name + symlink_path = lib_bin_dir / binary_name + + if app_index != -1 and len(binary_parts) > app_index + 2 and binary_parts[app_index + 1 : app_index + 3] == ("Contents", "MacOS"): + if symlink_path.exists() or symlink_path.is_symlink(): + try: + symlink_path.unlink() + except (OSError, PermissionError) as e: + print(f"Failed to remove existing file at {symlink_path}: {e}", file=sys.stderr) + return None + return binary_abspath + + # Remove existing symlink/file if it exists + if symlink_path.exists() or symlink_path.is_symlink(): + try: + # Check if it's already pointing to the right place + if symlink_path.is_symlink() and symlink_path.resolve() == binary_abspath: + # Already correctly symlinked, nothing to do + return symlink_path + + # Remove old symlink/file + symlink_path.unlink() + except (OSError, PermissionError) as e: + print(f"Failed to remove existing file at {symlink_path}: {e}", file=sys.stderr) + 
return None + + # Create new symlink + try: + symlink_path.symlink_to(binary_abspath) + print(f"Symlinked {binary_name} -> {symlink_path}", file=sys.stderr) + return symlink_path + except (OSError, PermissionError) as e: + print(f"Failed to create symlink {symlink_path} -> {binary_abspath}: {e}", file=sys.stderr) + return None + + +# ============================================================================= +# Process Model +# ============================================================================= + + +class ProcessManager(models.Manager): + """Manager for Process model.""" + + def current(self) -> Process: + """Get the Process record for the current OS process.""" + return Process.current() + + def get_by_pid(self, pid: int, machine: Machine | None = None) -> Process | None: + """ + Find a Process by PID with proper validation against PID reuse. + + IMPORTANT: PIDs are reused by the OS! This method: + 1. Filters by machine (required - PIDs are only unique per machine) + 2. Filters by time window (processes older than 24h are stale) + 3. 
def create_for_archiveresult(self, archiveresult, **kwargs):
    """
    Create a Process record for an ArchiveResult.

    Called during migration and when creating new ArchiveResults.

    Args:
        archiveresult: ArchiveResult whose snapshot output dir and plugin
            name provide the default working directory.
        **kwargs: Field values for the new Process. A missing or falsy
            ``iface``, ``pwd`` or ``cmd`` falls back to the current network
            interface, ``<snapshot output_dir>/<plugin>`` and ``[]``
            respectively. All other keys are passed through unchanged.

    Returns:
        The newly created Process.
    """
    iface = kwargs.get("iface") or NetworkInterface.current()

    # Defaults from ArchiveResult if not provided
    fields = {
        "machine": iface.machine,
        "pwd": kwargs.get("pwd") or str(archiveresult.snapshot.output_dir / archiveresult.plugin),
        "cmd": kwargs.get("cmd") or [],
        "status": "queued",
        "timeout": kwargs.get("timeout", 120),
        "env": kwargs.get("env", {}),
        "iface": iface,
    }

    # iface/pwd/cmd were already resolved above with their fallbacks. Copying
    # them again from kwargs would put an explicit None/"" back over the
    # fallback (e.g. pwd=None on a NOT NULL column), so they are skipped here.
    resolved_above = ("iface", "pwd", "cmd")
    fields.update({key: value for key, value in kwargs.items() if key not in resolved_above})

    return self.create(**fields)
+ One Process can optionally be associated with an ArchiveResult (via OneToOne), + but Process can also exist standalone for internal operations. + + Follows the unified state machine pattern: + - queued: Process ready to launch + - running: Process actively executing + - exited: Process completed (check exit_code for success/failure) + + State machine calls launch() to spawn the process and monitors its lifecycle. + """ + + class StatusChoices(models.TextChoices): + QUEUED = "queued", "Queued" + RUNNING = "running", "Running" + EXITED = "exited", "Exited" + + class TypeChoices(models.TextChoices): + SUPERVISORD = "supervisord", "Supervisord" + ORCHESTRATOR = "orchestrator", "Orchestrator" + WORKER = "worker", "Worker" + CLI = "cli", "CLI" + HOOK = "hook", "Hook" + BINARY = "binary", "Binary" + + # Primary fields + id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True) + created_at = models.DateTimeField(default=timezone.now, db_index=True) + modified_at = models.DateTimeField(auto_now=True) + + # Machine FK - required (every process runs on a machine) + machine = models.ForeignKey( + Machine, + on_delete=models.CASCADE, + null=False, + related_name="process_set", + help_text="Machine where this process executed", + ) + + # Parent process (optional) + parent = models.ForeignKey( + "self", + on_delete=models.SET_NULL, + null=True, + blank=True, + related_name="children", + help_text="Parent process that spawned this process", + ) + + # Process type (cli, worker, orchestrator, binary, supervisord) + process_type = models.CharField( + max_length=16, + choices=TypeChoices.choices, + default=TypeChoices.CLI, + db_index=True, + help_text="Type of process (cli, worker, orchestrator, binary, supervisord)", + ) + + # Worker type (only for WORKER processes: crawl, snapshot, archiveresult) + worker_type = models.CharField( + max_length=32, + default="", + null=False, + blank=True, + db_index=True, + help_text="Worker type name for WORKER processes 
(crawl, snapshot, archiveresult)", + ) + + # Execution metadata + pwd = models.CharField( + max_length=512, + default="", + null=False, + blank=True, + help_text="Working directory for process execution", + ) + cmd = models.JSONField( + default=list, + null=False, + blank=True, + help_text="Command as array of arguments", + ) + env = models.JSONField( + default=dict, + null=False, + blank=True, + help_text="Environment variables for process", + ) + timeout = models.IntegerField( + default=120, + null=False, + help_text="Timeout in seconds", + ) + + # Process results + pid = models.IntegerField( + default=None, + null=True, + blank=True, + help_text="OS process ID", + ) + exit_code = models.IntegerField( + default=None, + null=True, + blank=True, + help_text="Process exit code (0 = success)", + ) + stdout = models.TextField( + default="", + null=False, + blank=True, + help_text="Standard output from process", + ) + stderr = models.TextField( + default="", + null=False, + blank=True, + help_text="Standard error from process", + ) + + # Timing + started_at = models.DateTimeField( + default=None, + null=True, + blank=True, + help_text="When process was launched", + ) + ended_at = models.DateTimeField( + default=None, + null=True, + blank=True, + help_text="When process completed/terminated", + ) + + # Optional FKs + binary = models.ForeignKey( + Binary, + on_delete=models.SET_NULL, + null=True, + blank=True, + related_name="process_set", + help_text="Binary used by this process", + ) + iface = models.ForeignKey( + NetworkInterface, + on_delete=models.SET_NULL, + null=True, + blank=True, + related_name="process_set", + help_text="Network interface used by this process", + ) + + # Optional connection URL (for CDP, sonic, etc.) 
def __str__(self) -> str:
    """Return a short label: record id, the first three command arguments, and status."""
    if self.cmd:
        cmd_preview = " ".join(self.cmd[:3])
    else:
        cmd_preview = "(no cmd)"
    return f"Process[{self.id}] {cmd_preview} ({self.status})"


# Properties that delegate to related objects
@property
def cmd_version(self) -> str:
    """Version of the linked binary, or "" when no binary is linked."""
    linked = self.binary
    if not linked:
        return ""
    return linked.version


@property
def bin_abspath(self) -> str:
    """Absolute path of the linked binary, or "" when no binary is linked."""
    linked = self.binary
    if not linked:
        return ""
    return linked.abspath


@property
def plugin(self) -> str:
    """Plugin name of the attached ArchiveResult, or "" when none is attached."""
    if not hasattr(self, "archiveresult"):
        return ""
    return self.archiveresult.plugin


@property
def hook_name(self) -> str:
    """Hook name of the attached ArchiveResult, or "" when none is attached."""
    if not hasattr(self, "archiveresult"):
        return ""
    return self.archiveresult.hook_name


def to_json(self) -> dict:
    """
    Serialize this Process into a JSON-friendly dict.

    The core keys are always present; binary_id, pid and timeout are only
    added when they hold a truthy value.
    """
    from archivebox.config import VERSION

    started = self.started_at.isoformat() if self.started_at else None
    ended = self.ended_at.isoformat() if self.ended_at else None

    record = {
        "type": "Process",
        "schema_version": VERSION,
        "id": str(self.id),
        "machine_id": str(self.machine_id),
        "cmd": self.cmd,
        "pwd": self.pwd,
        "status": self.status,
        "exit_code": self.exit_code,
        "started_at": started,
        "ended_at": ended,
    }

    # Optional keys, emitted only when set
    if self.binary_id:
        record["binary_id"] = str(self.binary_id)
    for field_name in ("pid", "timeout"):
        field_value = getattr(self, field_name)
        if field_value:
            record[field_name] = field_value
    return record


def hydrate_binary_from_context(self, *, plugin_name: str = "", hook_path: str = "") -> Binary | None:
    """
    Try to attach an already-known Binary to this process.

    Candidate references are the values of the env keys reported by
    _get_process_binary_env_keys(), followed by the first command argument.
    The first reference that resolves to a Binary is assigned to self.binary
    (not saved here) and returned; None is returned when nothing resolves.
    """
    machine = self.machine if self.machine_id else Machine.current()

    candidates = [
        str(self.env.get(key) or "").strip()
        for key in _get_process_binary_env_keys(plugin_name, hook_path, self.env)
    ]
    if self.cmd:
        candidates.append(str(self.cmd[0]).strip())

    # dict.fromkeys drops duplicates while keeping first-seen order
    references = [ref for ref in dict.fromkeys(candidates) if ref]

    for reference in references:
        found = _find_existing_binary_for_reference(machine, reference)
        if found:
            self.binary = found
            return found

    return None


@classmethod
def parse_records_from_text(cls, text: str) -> list[dict]:
    """Parse JSONL text line by line, keeping only records that carry a "type"."""
    from archivebox.misc.jsonl import parse_line

    if not text:
        return []
    parsed = (parse_line(line) for line in text.splitlines())
    return [record for record in parsed if record and record.get("type")]


def get_records(self) -> list[dict]:
    """Parse JSONL records from self.stdout, falling back to the stdout log file when the field is empty."""
    text = self.stdout
    if not text:
        log_path = self.stdout_file
        if log_path and log_path.exists():
            text = log_path.read_text()
    return self.parse_records_from_text(text or "")
@staticmethod
def from_json(record: dict[str, Any], overrides: dict[str, Any] | None = None):
    """
    Look up an existing Process from a JSON record.

    Despite the name, this only fetches: no row is created or updated.

    Args:
        record: JSON dict; only its 'id' key is read
        overrides: Accepted but currently unused

    Returns:
        The matching Process, or None when the record has no id, the id is
        not a well-formed UUID string, or no row with that id exists
    """
    import uuid

    process_id = record.get("id")
    if not process_id:
        return None

    # A malformed id string would make the ORM lookup raise a validation
    # error rather than DoesNotExist, so reject it up front and honour the
    # "or None" contract.
    if isinstance(process_id, str):
        try:
            process_id = uuid.UUID(process_id)
        except ValueError:
            return None

    try:
        return Process.objects.get(id=process_id)
    except Process.DoesNotExist:
        return None


def update_and_requeue(self, **kwargs) -> bool:
    """
    Set the given fields on this record and save it.

    modified_at is bumped explicitly before saving so the change is visible
    to anything that watches that timestamp.

    Args:
        **kwargs: field name -> new value, applied with setattr()

    Returns:
        Always True
    """
    for field_name, value in kwargs.items():
        setattr(self, field_name, value)
    self.modified_at = timezone.now()
    self.save()
    return True
# =========================================================================
# Process.current() and hierarchy methods
# =========================================================================

@classmethod
def current(cls) -> Process:
    """
    Return the Process row for the OS process running this code, creating it if needed.

    Resolution order:
    1. The module-level cache, while it still matches this PID and machine and
       was modified less than PROCESS_RECHECK_INTERVAL seconds ago.
    2. A RUNNING row for this machine + PID started within PID_REUSE_WINDOW whose
       started_at is within START_TIME_TOLERANCE of the OS-reported start time
       (only attempted when psutil reported a start time).
    3. A freshly created RUNNING row.

    PIDs are recycled by the OS, so a PID match alone is never trusted.
    A reused row gets its iface refreshed if it changed; every returned row has
    ensure_log_files() called on it.
    """
    global _CURRENT_PROCESS

    my_pid = os.getpid()
    this_machine = Machine.current()
    active_iface = NetworkInterface.current()

    def _reuse(record):
        # Keep the network interface current on a reused row, then hand it back
        if record.iface_id != active_iface.id:
            record.iface = active_iface
            record.save(update_fields=["iface", "modified_at"])
        record.ensure_log_files()
        return record

    # 1. In-memory cache
    cached = _CURRENT_PROCESS
    if cached:
        still_valid = (
            cached.pid == my_pid
            and cached.machine_id == this_machine.id
            and timezone.now() < cached.modified_at + timedelta(seconds=PROCESS_RECHECK_INTERVAL)
        )
        if still_valid:
            return _reuse(cached)
        _CURRENT_PROCESS = None

    # Ask the OS when this process really started (used to detect PID reuse)
    os_started = None
    if PSUTIL_AVAILABLE:
        try:
            os_started = psutil.Process(my_pid).create_time()
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            pass

    # 2. Existing DB row: machine + PID + RUNNING + recent + matching start time
    if os_started:
        candidate = (
            cls.objects.filter(
                machine=this_machine,
                pid=my_pid,
                status=cls.StatusChoices.RUNNING,
                started_at__gte=timezone.now() - PID_REUSE_WINDOW,
            )
            .order_by("-started_at")
            .first()
        )
        if (
            candidate
            and candidate.started_at
            and abs(candidate.started_at.timestamp() - os_started) < START_TIME_TOLERANCE
        ):
            _CURRENT_PROCESS = candidate
            return _reuse(candidate)

    # 3. Nothing reusable - create a new row
    parent = cls._find_parent_process(this_machine)
    process_type = cls._detect_process_type()

    # Prefer the psutil cmdline (it is what .proc later validates against),
    # falling back to sys.argv
    argv = sys.argv
    if PSUTIL_AVAILABLE:
        try:
            argv = psutil.Process(my_pid).cmdline()
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            pass

    # Prefer the OS start time over "now" when it is known
    started_at = (
        datetime.fromtimestamp(os_started, tz=timezone.get_current_timezone())
        if os_started
        else timezone.now()
    )

    _CURRENT_PROCESS = cls.objects.create(
        machine=this_machine,
        parent=parent,
        process_type=process_type,
        cmd=argv,
        pwd=os.getcwd(),
        pid=my_pid,
        started_at=started_at,
        status=cls.StatusChoices.RUNNING,
        iface=active_iface,
    )
    _CURRENT_PROCESS.ensure_log_files()
    return _CURRENT_PROCESS
@classmethod
def _find_parent_process(cls, machine: Machine | None = None) -> Process | None:
    """
    Locate the Process row of this process's OS parent (looked up by PPID).

    A row only counts as the parent when it is on the same machine, is RUNNING,
    started within PID_REUSE_WINDOW, and its started_at is within
    START_TIME_TOLERANCE of the parent's OS start time. These checks guard
    against PID reuse.

    Returns None when psutil is unavailable, the parent cannot be inspected,
    or no row matches (i.e. the parent is not a tracked process).
    """
    if not PSUTIL_AVAILABLE:
        return None

    parent_pid = os.getppid()
    machine = machine or Machine.current()

    try:
        parent_started = psutil.Process(parent_pid).create_time()
    except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
        return None  # parent is gone or cannot be inspected

    recent_running = cls.objects.filter(
        machine=machine,
        pid=parent_pid,
        status=cls.StatusChoices.RUNNING,
        started_at__gte=timezone.now() - PID_REUSE_WINDOW,
    ).order_by("-started_at")

    # Newest first; take the first row whose start time agrees with the OS
    return next(
        (
            record
            for record in recent_running
            if record.started_at
            and abs(record.started_at.timestamp() - parent_started) < START_TIME_TOLERANCE
        ),
        None,
    )
@classmethod
def _detect_process_type(cls) -> str:
    """Classify the current process by looking for known markers in the lower-cased sys.argv."""
    argv_text = " ".join(sys.argv).lower()

    # Order matters: the first marker found wins
    markers = (
        ("supervisord", cls.TypeChoices.SUPERVISORD),
        ("runner_watch", cls.TypeChoices.WORKER),
        ("archivebox run", cls.TypeChoices.ORCHESTRATOR),
        ("archivebox", cls.TypeChoices.CLI),
    )
    for marker, process_type in markers:
        if marker in argv_text:
            return process_type
    return cls.TypeChoices.BINARY


@classmethod
def cleanup_stale_running(cls, machine: Machine | None = None) -> int:
    """
    Mark stale RUNNING rows on a machine as EXITED.

    A RUNNING row is cleaned up when any of these holds:
    - poll() reports that it has exited
    - started_at + timeout + PROCESS_TIMEOUT_GRACE has passed
    - started_at is older than PID_REUSE_WINDOW
    - the OS has no such PID, or the PID's start time differs from started_at
      by more than START_TIME_TOLERANCE

    This method only updates rows; it sends no signals itself.

    Returns:
        Number of rows cleaned up
    """
    machine = machine or Machine.current()

    def _looks_stale(record) -> bool:
        started = record.started_at
        if started:
            budget = max(int(record.timeout or 0), 0)
            deadline = started + timedelta(seconds=budget) + PROCESS_TIMEOUT_GRACE
            if timezone.now() >= deadline:
                return True
            # Too old: the PID has certainly been recycled by now
            if started < timezone.now() - PID_REUSE_WINDOW:
                return True

        if not PSUTIL_AVAILABLE or record.pid is None:
            return False

        # Does the OS still have this PID with a matching start time?
        try:
            os_proc = psutil.Process(record.pid)
            if started:
                drift = abs(started.timestamp() - os_proc.create_time())
                return drift > START_TIME_TOLERANCE  # PID reused by another process
        except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
            return True  # process no longer exists
        return False

    cleaned = 0
    running_rows = cls.objects.filter(
        machine=machine,
        status=cls.StatusChoices.RUNNING,
    )
    for record in running_rows:
        if record.poll() is not None:
            cleaned += 1
            continue

        if not _looks_stale(record):
            continue

        record.status = cls.StatusChoices.EXITED
        record.ended_at = record.ended_at or timezone.now()
        if record.exit_code is None:
            record.exit_code = 0
        record.save(update_fields=["status", "ended_at", "exit_code"])
        cleaned += 1

    return cleaned
# =========================================================================
# Tree traversal properties
# =========================================================================

@property
def root(self) -> Process:
    """Topmost process of this hierarchy, found by following parent links until none is left."""
    node = self
    while node.parent_id:
        node = node.parent
    return node


@property
def ancestors(self) -> list[Process]:
    """Every ancestor, ordered from the direct parent up to the root."""
    lineage: list[Process] = []
    node = self
    while (node := node.parent):
        lineage.append(node)
    return lineage


@property
def depth(self) -> int:
    """Number of ancestors above this process (0 for a root)."""
    return len(self.ancestors)


def get_descendants(self, include_self: bool = False):
    """
    Collect all DB descendants of this process, level by level.

    Args:
        include_self: If True, this process is part of the result as well

    Returns:
        QuerySet of Process rows
    """
    collected = [self.pk] if include_self else []

    # Breadth-first: one query per tree level
    frontier = list(self.children.values_list("pk", flat=True))
    while frontier:
        collected += frontier
        frontier = list(Process.objects.filter(parent_id__in=frontier).values_list("pk", flat=True))

    return Process.objects.filter(pk__in=collected)
# =========================================================================
# Validated psutil access via .proc property
# =========================================================================

@property
def proc(self) -> psutil.Process | None:
    """
    Get a validated psutil.Process for this record.

    Returns psutil.Process ONLY if:
    1. A process with this PID exists in the OS
    2. Its OS start time matches our started_at (within START_TIME_TOLERANCE)
    3. Its command line mentions our recorded executable (when readable)
    4. The record belongs to the current machine

    Returns None if:
    - psutil is not available, or no PID is recorded
    - the record belongs to a different machine
    - the PID doesn't exist or cannot be inspected
    - the PID was reused by a different process (start time / command mismatch)

    This prevents accidentally matching a stale/recycled PID. Expected psutil
    lookup failures are turned into None instead of being raised.
    """
    if not PSUTIL_AVAILABLE:
        return None

    # Can't get psutil.Process if we don't have a PID
    if not self.pid:
        return None

    # Can't validate processes on other machines
    if self.machine_id != Machine.current().id:
        return None

    try:
        os_proc = psutil.Process(self.pid)
        # create_time() can raise too (e.g. AccessDenied once the PID belongs
        # to someone else), so it must sit inside the same try
        os_start_time = os_proc.create_time() if self.started_at else None
    except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
        return None  # Process no longer exists or cannot be inspected

    # Validate start time matches to prevent PID reuse confusion
    if os_start_time is not None:
        if abs(os_start_time - self.started_at.timestamp()) > START_TIME_TOLERANCE:
            return None  # PID has been reused by a different process

    # Extra safety: the recorded executable should appear in the OS cmdline
    if self.cmd:
        try:
            os_cmdline = os_proc.cmdline()
        except (psutil.AccessDenied, psutil.ZombieProcess):
            os_cmdline = None  # Can't check cmdline, trust the start time match
        except psutil.NoSuchProcess:
            return None  # Process exited between the two checks

        db_binary = self.cmd[0]
        if os_cmdline and db_binary:
            db_binary_name = Path(db_binary).name
            cmd_matches = any(
                arg == db_binary or Path(arg).name == db_binary_name
                for arg in os_cmdline
                if arg
            )
            if not cmd_matches:
                return None  # Different command, PID reused

    return os_proc
@property
def is_running(self) -> bool:
    """
    Whether the validated OS process behind this record is alive.

    Relies on self.proc, so a recycled PID counts as not running. Zombies are
    reported as not running; if the status lookup itself fails, the answer
    falls back to psutil's is_running().
    """
    live = self.proc
    if live is None:
        return False
    try:
        is_zombie = live.status() == psutil.STATUS_ZOMBIE
    except Exception:
        is_zombie = False
    if is_zombie:
        return False
    return live.is_running()


def is_alive(self) -> bool:
    """Method-style alias for the is_running property."""
    return self.is_running


def get_memory_info(self) -> dict | None:
    """Return {"rss": ..., "vms": ...} for the live process, or None when it is unavailable."""
    live = self.proc
    if not live:
        return None
    try:
        usage = live.memory_info()
    except (psutil.NoSuchProcess, psutil.AccessDenied):
        return None
    return {"rss": usage.rss, "vms": usage.vms}


def get_cpu_percent(self) -> float | None:
    """Return CPU usage sampled over 0.1s for the live process, or None when it is unavailable."""
    live = self.proc
    if not live:
        return None
    try:
        return live.cpu_percent(interval=0.1)
    except (psutil.NoSuchProcess, psutil.AccessDenied):
        return None


def get_children_pids(self) -> list[int]:
    """Return PIDs of all OS-level descendants (recursive; not the DB children), or [] when unavailable."""
    live = self.proc
    if not live:
        return []
    try:
        return [child.pid for child in live.children(recursive=True)]
    except (psutil.NoSuchProcess, psutil.AccessDenied):
        return []


# =========================================================================
# Lifecycle methods (launch, kill, poll, wait)
# =========================================================================

@property
def pid_file(self) -> Path | None:
    """Location of process.pid inside runtime_dir (None without a runtime dir)."""
    base = self.runtime_dir
    if not base:
        return None
    return base / "process.pid"


@property
def cmd_file(self) -> Path | None:
    """Location of the cmd.sh script inside runtime_dir (None without a runtime dir)."""
    base = self.runtime_dir
    if not base:
        return None
    return base / "cmd.sh"


@property
def stdout_file(self) -> Path | None:
    """Location of stdout.log inside runtime_dir (None without a runtime dir)."""
    base = self.runtime_dir
    if not base:
        return None
    return base / "stdout.log"


@property
def stderr_file(self) -> Path | None:
    """Location of stderr.log inside runtime_dir (None without a runtime dir)."""
    base = self.runtime_dir
    if not base:
        return None
    return base / "stderr.log"


@property
def hook_script_name(self) -> str | None:
    """
    Best-effort hook filename taken from the command of a HOOK process.

    Returns the first non-flag argument whose basename starts with "on_" and
    ends in .py, .js or .sh; None for non-hook processes or when nothing matches.
    """
    if self.process_type != self.TypeChoices.HOOK or not self.cmd:
        return None

    hook_suffixes = {".py", ".js", ".sh"}
    for raw_arg in self.cmd:
        text = str(raw_arg)
        if text.startswith("-"):
            continue  # option flag, not a script path
        filename = Path(text).name
        if filename.startswith("on_") and Path(filename).suffix in hook_suffixes:
            return filename

    return None


@property
def runtime_dir(self) -> Path | None:
    """
    Directory holding this process's log/pid/cmd files.

    It is pwd itself, or pwd/.hooks/<hook script name> when a hook script name
    can be derived; None when pwd is empty.
    """
    if not self.pwd:
        return None

    workdir = Path(self.pwd)
    script = self.hook_script_name
    return workdir / ".hooks" / script if script else workdir
@staticmethod
def _tail_log_file(log_path: Path | None, lines: int, follow: bool):
    """
    Shared generator behind tail_stdout()/tail_stderr().

    Args:
        log_path: Log file to read; nothing is yielded if it is None or missing
        lines: Number of trailing lines wanted; zero or negative means none
        follow: If True, keep yielding new lines as they are appended (never returns)

    Yields:
        Log lines without their trailing newline
    """
    if not log_path or not log_path.exists():
        return

    backlog = max(lines, 0)

    if not follow:
        # tail -n: a slice of [-0:] would return the WHOLE file, so bail out early
        if backlog == 0:
            return
        try:
            content = log_path.read_text()
        except (OSError, UnicodeError):
            return  # unreadable log: best-effort, yield nothing
        yield from content.splitlines()[-backlog:]
        return

    # tail -f
    import time

    with open(log_path) as f:
        # Start roughly `backlog` lines before the end (estimate: 100 bytes per line)
        f.seek(0, 2)  # Seek to end
        file_size = f.tell()
        seek_pos = max(0, file_size - (backlog * 100))
        f.seek(seek_pos)

        # Skip the partial line we most likely landed in
        if seek_pos > 0:
            f.readline()

        # Existing lines first
        for line in f:
            yield line.rstrip("\n")

        # Then poll for appended lines
        while True:
            line = f.readline()
            if line:
                yield line.rstrip("\n")
            else:
                time.sleep(0.1)  # Wait before checking again


def tail_stdout(self, lines: int = 50, follow: bool = False):
    """
    Tail the stdout log file (like `tail` or `tail -f`).

    Args:
        lines: Number of lines to show (default 50); zero or negative shows none
        follow: If True, follow the file and yield new lines as they appear

    Yields:
        Lines from stdout
    """
    yield from self._tail_log_file(self.stdout_file, lines, follow)


def tail_stderr(self, lines: int = 50, follow: bool = False):
    """
    Tail the stderr log file (like `tail` or `tail -f`).

    Args:
        lines: Number of lines to show (default 50); zero or negative shows none
        follow: If True, follow the file and yield new lines as they appear

    Yields:
        Lines from stderr
    """
    yield from self._tail_log_file(self.stderr_file, lines, follow)


def pipe_stdout(self, lines: int = 10, follow: bool = True):
    """
    Print the tail of the stdout log to sys.stdout.

    Args:
        lines: Number of initial lines to show
        follow: If True, keep printing new lines as they appear
    """
    import sys

    for line in self.tail_stdout(lines=lines, follow=follow):
        print(line, file=sys.stdout, flush=True)


def pipe_stderr(self, lines: int = 10, follow: bool = True):
    """
    Print the tail of the stderr log to sys.stderr.

    Args:
        lines: Number of initial lines to show
        follow: If True, keep printing new lines as they appear
    """
    import sys

    for line in self.tail_stderr(lines=lines, follow=follow):
        print(line, file=sys.stderr, flush=True)
def _write_pid_file(self) -> None:
    """
    Write the PID file and stamp its mtime with the process start time.

    Does nothing unless pid, started_at and pid_file are all set.
    """
    pid_path = self.pid_file
    if not (self.pid and self.started_at and pid_path):
        return

    pid_path.parent.mkdir(parents=True, exist_ok=True)
    pid_path.write_text(str(self.pid))

    # The mtime doubles as a record of the start time; it is optional, so
    # failing to set it is not an error
    try:
        start_time = self.started_at.timestamp()
        os.utime(pid_path, (start_time, start_time))
    except OSError:
        pass


def _write_cmd_file(self) -> None:
    """
    Write cmd.sh, a runnable shell script reproducing this process's command.

    Every argument is quoted with shlex so that shell metacharacters
    (&, ;, ?, $, quotes, spaces, ...) survive verbatim when the script is run;
    non-string arguments are stringified first. Does nothing when there is no
    command or no cmd_file path.
    """
    import shlex

    cmd_path = self.cmd_file
    if not self.cmd or not cmd_path:
        return

    cmd_path.parent.mkdir(parents=True, exist_ok=True)

    script = "#!/bin/bash\n" + shlex.join(str(arg) for arg in self.cmd) + "\n"
    cmd_path.write_text(script)

    # Executable bit is a convenience only
    try:
        cmd_path.chmod(0o755)
    except OSError:
        pass


def ensure_log_files(self) -> None:
    """
    Make sure the runtime dir and the stdout/stderr log files exist.

    Best-effort: returns silently when there is no runtime dir or the
    filesystem refuses (OSError).
    """
    runtime_dir = self.runtime_dir
    if not runtime_dir:
        return
    try:
        runtime_dir.mkdir(parents=True, exist_ok=True)
        for log_path in (self.stdout_file, self.stderr_file):
            if log_path:
                log_path.parent.mkdir(parents=True, exist_ok=True)
                log_path.touch(exist_ok=True)
    except OSError:
        return
def _build_env(self) -> dict:
    """
    Build the subprocess environment: a copy of os.environ overlaid with self.env.

    Stored values are converted to strings: str as-is, bool/int/float via
    str() (so booleans become "True"/"False"), anything else as JSON.
    Entries whose value is None are skipped.
    """
    import json

    def _as_env_text(raw) -> str:
        if isinstance(raw, str):
            return raw
        if isinstance(raw, (bool, int, float)):
            return str(raw)
        # Lists, dicts, etc.
        return json.dumps(raw, default=str)

    merged = os.environ.copy()
    overrides = self.env or {}
    merged.update({name: _as_env_text(raw) for name, raw in overrides.items() if raw is not None})
    return merged


def launch(self, background: bool = False, cwd: str | None = None) -> Process:
    """
    Spawn the subprocess for this record and persist what happened.

    stdout/stderr are appended to the record's log files. After spawning, the
    record is saved as RUNNING with pid and started_at filled in. In foreground
    mode the call blocks until exit or self.timeout; on timeout the child is
    killed and exit_code is 128 + SIGKILL. The log contents are then copied
    into stdout/stderr and the record is saved as EXITED.

    Args:
        background: If True, return right after spawning (daemons/bg hooks)
        cwd: Working directory for the subprocess (defaults to self.pwd)

    Returns:
        self

    Raises:
        ValueError: if self.pwd is empty
        RuntimeError: if the log file paths cannot be determined
    """
    import subprocess

    # pwd is required: the output files live underneath it
    if not self.pwd:
        raise ValueError("Process.pwd must be set before calling launch()")

    working_dir = cwd or self.pwd

    # cmd.sh is written for debugging
    self._write_cmd_file()

    stdout_path = self.stdout_file
    stderr_path = self.stderr_file
    for log_path in (stdout_path, stderr_path):
        if log_path:
            log_path.parent.mkdir(parents=True, exist_ok=True)
    if stdout_path is None or stderr_path is None:
        raise RuntimeError("Process log paths could not be determined")

    with open(stdout_path, "a") as out, open(stderr_path, "a") as err:
        child = subprocess.Popen(
            self.cmd,
            cwd=working_dir,
            stdout=out,
            stderr=err,
            env=self._build_env(),
        )

        # Prefer the OS-reported start time; fall back to "now"
        launched_at = None
        if PSUTIL_AVAILABLE:
            try:
                launched_at = datetime.fromtimestamp(
                    psutil.Process(child.pid).create_time(),
                    tz=timezone.get_current_timezone(),
                )
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                launched_at = None
        self.started_at = launched_at or timezone.now()

        self.pid = child.pid
        self.status = self.StatusChoices.RUNNING
        self.save()

        self._write_pid_file()

        if background:
            return self

        try:
            child.wait(timeout=self.timeout)
        except subprocess.TimeoutExpired:
            import signal

            child.kill()
            child.wait()
            self.exit_code = 128 + signal.SIGKILL
        else:
            self.exit_code = child.returncode

        self.ended_at = timezone.now()
        if stdout_path.exists():
            self.stdout = stdout_path.read_text()
        if stderr_path.exists():
            self.stderr = stderr_path.read_text()
        self.status = self.StatusChoices.EXITED
        self.save()

    return self
def kill(self, signal_num: int = 15) -> bool:
    """
    Send a signal to this process and mark the record EXITED.

    The signal goes through self.proc, so it is only delivered when the PID
    still belongs to the recorded process (a recycled PID is never signalled).

    Args:
        signal_num: Signal to send (default SIGTERM=15)

    Returns:
        True if the signal was sent, False if there was no matching process
        or sending failed
    """
    exited = self.StatusChoices.EXITED

    def _mark_gone() -> None:
        # Record the exit without inventing an exit code or overwriting ended_at
        self.status = exited
        self.ended_at = self.ended_at or timezone.now()
        self.save()

    target = self.proc
    if target is None:
        # No such process, or the PID was recycled - only the record needs fixing
        if self.status != exited:
            _mark_gone()
        return False

    try:
        target.send_signal(signal_num)

        # Standard Unix convention: 128 + signal number
        self.exit_code = 128 + signal_num
        self.ended_at = timezone.now()
        self.status = exited
        self.save()

        # The PID file is no longer meaningful
        pid_path = self.pid_file
        if pid_path and pid_path.exists():
            pid_path.unlink(missing_ok=True)

        return True
    except (psutil.NoSuchProcess, psutil.AccessDenied, ProcessLookupError):
        # Process went away between the proc check and the signal
        _mark_gone()
        return False
def poll(self) -> int | None:
    """
    Check whether the process has exited and, if so, finalize the record.

    On exit the stdout/stderr log contents are copied into the record (the log
    files and cmd.sh are deliberately left on disk for debugging), the PID file
    is removed, and the record is saved as EXITED. An unknown exit code is
    stored as 0 and a stored -1 is rewritten to 137.

    Returns:
        exit_code if exited, None if still running
    """
    exited = self.StatusChoices.EXITED

    # Already finalized: only normalize a legacy -1 exit code
    if self.status == exited:
        if self.exit_code == -1:
            self.exit_code = 137
            self.save(update_fields=["exit_code"])
        return self.exit_code

    if self.is_running:
        return None  # Still running

    # Best-effort reap in case the process is a zombie child of ours
    leftover = self.proc
    if leftover is not None:
        try:
            leftover.wait(timeout=0)
        except Exception:
            pass

    # Copy captured output into the DB; the files themselves are kept
    for field_name, log_path in (("stdout", self.stdout_file), ("stderr", self.stderr_file)):
        if log_path and log_path.exists():
            setattr(self, field_name, log_path.read_text())

    # The PID file is not needed for debugging
    pid_path = self.pid_file
    if pid_path and pid_path.exists():
        pid_path.unlink(missing_ok=True)

    code = 0 if self.exit_code is None else self.exit_code
    self.exit_code = 137 if code == -1 else code
    self.ended_at = timezone.now()
    self.status = exited
    self.save()
    return self.exit_code
def wait(self, timeout: int | None = None) -> int:
    """
    Wait for the process to exit, polling every 0.1s.

    Args:
        timeout: Max seconds to wait. None means self.timeout; 0 means
            "check once, then give up". HOOK processes are additionally
            capped at CONSTANTS.MAX_HOOK_RUNTIME_SECONDS.

    Returns:
        exit_code

    Raises:
        TimeoutError if process doesn't exit in time
    """
    import time

    # Only None selects the default, so an explicit 0 is honoured
    if timeout is None:
        timeout = self.timeout
    if self.process_type == self.TypeChoices.HOOK:
        # Imported here because only hook processes need the cap
        from archivebox.config.constants import CONSTANTS

        timeout = min(int(timeout), int(CONSTANTS.MAX_HOOK_RUNTIME_SECONDS))

    # Monotonic clock: wall-clock adjustments must not stretch or cut the wait
    start = time.monotonic()

    while True:
        exit_code = self.poll()
        if exit_code is not None:
            return exit_code

        if time.monotonic() - start > timeout:
            raise TimeoutError(f"Process {self.id} did not exit within {timeout}s")

        time.sleep(0.1)


def terminate(self, graceful_timeout: float = 5.0) -> bool:
    """
    Gracefully terminate process: SIGTERM -> wait -> SIGKILL.

    Goes through self.proc, so a recycled PID is never signalled. The record
    is saved as EXITED in every outcome. Exit codes follow the 128 + signal
    convention used by kill(): a negative "killed by signal N" status from
    psutil is stored as 128 + N, and a forced kill as 128 + SIGKILL.

    Args:
        graceful_timeout: Seconds to wait after SIGTERM before SIGKILL

    Returns:
        True if process was terminated, False if already dead
    """
    import signal

    exited = self.StatusChoices.EXITED

    proc = self.proc
    if proc is None:
        # Already dead - just update status
        if self.status != exited:
            self.status = exited
            self.ended_at = self.ended_at or timezone.now()
            self.save()
        return False

    try:
        # Step 1: Send SIGTERM for graceful shutdown
        proc.terminate()

        # Step 2: Wait for graceful exit
        try:
            exit_status = proc.wait(timeout=graceful_timeout)
        except psutil.TimeoutExpired:
            pass  # Still running, need to force kill
        else:
            if exit_status is None:
                # psutil cannot report a status for this process
                self.exit_code = 0
            elif exit_status < 0:
                # psutil reports "terminated by signal N" as -N
                self.exit_code = 128 + abs(int(exit_status))
            else:
                self.exit_code = exit_status
            self.status = exited
            self.ended_at = timezone.now()
            self.save()
            return True

        # Step 3: Force kill with SIGKILL
        proc.kill()
        try:
            proc.wait(timeout=2)
        except psutil.TimeoutExpired:
            # SIGKILL was delivered but the process has not gone away yet;
            # record the kill instead of raising to the caller
            pass

        self.exit_code = 128 + signal.SIGKILL
        self.status = exited
        self.ended_at = timezone.now()
        self.save()
        return True

    except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
        # Process already dead
        self.status = exited
        self.ended_at = self.ended_at or timezone.now()
        self.save()
        return False
self.exit_code = 128 + signal.SIGKILL + self.status = self.StatusChoices.EXITED + self.ended_at = timezone.now() + self.save() + return True + + except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess): + # Process already dead + self.status = self.StatusChoices.EXITED + self.ended_at = self.ended_at or timezone.now() + self.save() + return False + + def kill_tree(self, graceful_timeout: float = 2.0) -> int: + """ + Kill this process and all its children (OS children, not DB children) in parallel. + + Uses parallel polling approach - sends SIGTERM to all processes at once, + then polls all simultaneously with individual deadline tracking. + + This consolidates the scattered child-killing logic from: + - crawls/models.py Crawl.cleanup() os.killpg() + - supervisord_util.py stop_existing_supervisord_process() + + Args: + graceful_timeout: Seconds to wait after SIGTERM before SIGKILL + + Returns: + Number of processes killed (including self) + """ + import signal + import time + import os + + killed_count = 0 + used_sigkill = False + proc = self.proc + if proc is None: + # Already dead + if self.status != self.StatusChoices.EXITED: + self.status = self.StatusChoices.EXITED + self.ended_at = self.ended_at or timezone.now() + self.save() + return 0 + + try: + # Phase 1: Get all children and send SIGTERM to entire tree in parallel + children = proc.children(recursive=True) + deadline = time.time() + graceful_timeout + + # Send SIGTERM to all children first (non-blocking) + for child in children: + try: + os.kill(child.pid, signal.SIGTERM) + except (OSError, ProcessLookupError): + pass + + # Send SIGTERM to parent + try: + os.kill(proc.pid, signal.SIGTERM) + except (OSError, ProcessLookupError): + pass + + # Phase 2: Poll all processes in parallel + all_procs = children + [proc] + still_running = {p.pid for p in all_procs} + + while still_running and time.time() < deadline: + time.sleep(0.1) + + for pid in list(still_running): + try: + # Check if process 
exited + os.kill(pid, 0) # Signal 0 checks if process exists + except (OSError, ProcessLookupError): + # Process exited + still_running.remove(pid) + killed_count += 1 + + # Phase 3: SIGKILL any stragglers that exceeded timeout + if still_running: + for pid in still_running: + try: + os.kill(pid, signal.SIGKILL) + killed_count += 1 + used_sigkill = True + except (OSError, ProcessLookupError): + pass + + # Update self status + if used_sigkill: + self.exit_code = 128 + signal.SIGKILL + else: + self.exit_code = 128 + signal.SIGTERM if killed_count > 0 else 0 + self.status = self.StatusChoices.EXITED + self.ended_at = timezone.now() + self.save() + + return killed_count + + except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess): + # Process tree already dead + self.status = self.StatusChoices.EXITED + self.ended_at = self.ended_at or timezone.now() + self.save() + return killed_count + + def kill_children_db(self) -> int: + """ + Kill all DB-tracked child processes (via parent FK). + + Different from kill_tree() which uses OS children. + This kills processes created via Process.create(parent=self). + + Returns: + Number of child Process records killed + """ + killed = 0 + for child in self.children.filter(status=self.StatusChoices.RUNNING): + if child.terminate(): + killed += 1 + return killed + + # ========================================================================= + # Class methods for querying processes + # ========================================================================= + + @classmethod + def get_running(cls, process_type: str | None = None, machine: Machine | None = None) -> QuerySet[Process]: + """ + Get all running processes, optionally filtered by type. 
+ + Replaces: + - workers/pid_utils.py get_all_worker_pids() + - workers/orchestrator.py get_total_worker_count() + + Args: + process_type: Filter by TypeChoices (e.g., 'worker', 'hook') + machine: Filter by machine (defaults to current) + + Returns: + QuerySet of running Process records + """ + machine = machine or Machine.current() + qs = cls.objects.filter( + machine=machine, + status=cls.StatusChoices.RUNNING, + ) + if process_type: + qs = qs.filter(process_type=process_type) + return qs + + @classmethod + def get_running_count(cls, process_type: str | None = None, machine: Machine | None = None) -> int: + """ + Get count of running processes. + + Replaces: + - workers/pid_utils.py get_running_worker_count() + """ + return cls.get_running(process_type=process_type, machine=machine).count() + + @classmethod + def stop_all(cls, process_type: str | None = None, machine: Machine | None = None, graceful: bool = True) -> int: + """ + Stop all running processes of a given type. + + Args: + process_type: Filter by TypeChoices + machine: Filter by machine + graceful: If True, use terminate() (SIGTERM→SIGKILL), else kill() + + Returns: + Number of processes stopped + """ + stopped = 0 + for proc in cls.get_running(process_type=process_type, machine=machine): + if graceful: + if proc.terminate(): + stopped += 1 + else: + if proc.kill(): + stopped += 1 + return stopped + + @classmethod + def get_next_worker_id(cls, process_type: str = "worker", machine: Machine | None = None) -> int: + """ + Get the next available worker ID for spawning new workers. + + Replaces workers/pid_utils.py get_next_worker_id(). + Simply returns count of running workers of this type. 
+ + Args: + process_type: Worker type to count + machine: Machine to scope query + + Returns: + Next available worker ID (0-indexed) + """ + return cls.get_running_count(process_type=process_type, machine=machine) + + @classmethod + def cleanup_orphaned_chrome(cls) -> int: + """ + Kill orphaned Chrome processes using chrome_utils.js killZombieChrome. + + Scans DATA_DIR for chrome/*.pid files from stale crawls (>5 min old) + and kills any orphaned Chrome processes. + + Called by: + - Orchestrator on startup (cleanup from previous crashes) + - Orchestrator periodically (every N minutes) + + Returns: + Number of zombie Chrome processes killed + """ + import subprocess + from pathlib import Path + from django.conf import settings + + chrome_utils = Path(__file__).parent.parent / "plugins" / "chrome" / "chrome_utils.js" + if not chrome_utils.exists(): + return 0 + + try: + result = subprocess.run( + ["node", str(chrome_utils), "killZombieChrome", str(settings.DATA_DIR)], + capture_output=True, + timeout=30, + text=True, + ) + if result.returncode == 0: + killed = int(result.stdout.strip()) + if killed > 0: + print(f"[yellow]🧹 Cleaned up {killed} orphaned Chrome processes[/yellow]") + return killed + except (subprocess.TimeoutExpired, ValueError, FileNotFoundError) as e: + print(f"[red]Failed to cleanup orphaned Chrome: {e}[/red]") + + return 0 + + @classmethod + def cleanup_orphaned_workers(cls) -> int: + """ + Mark orphaned worker/hook processes as EXITED in the DB. + + Orphaned if: + - Root (orchestrator/cli) is not running, or + - No orchestrator/cli ancestor exists. + + Standalone worker runs (archivebox run --snapshot-id) are allowed. 
+ """ + cleaned = 0 + + running_children = cls.objects.filter( + process_type__in=[cls.TypeChoices.WORKER, cls.TypeChoices.HOOK], + status=cls.StatusChoices.RUNNING, + ) + + for proc in running_children: + if not proc.is_running: + continue + + root = proc.root + # Standalone worker/hook process (run directly) + if root.id == proc.id and root.process_type in (cls.TypeChoices.WORKER, cls.TypeChoices.HOOK): + continue + + # If root is an active orchestrator/cli, keep it + if root.process_type in (cls.TypeChoices.ORCHESTRATOR, cls.TypeChoices.CLI) and root.is_running: + continue + + proc.status = cls.StatusChoices.EXITED + proc.ended_at = proc.ended_at or timezone.now() + proc.exit_code = proc.exit_code if proc.exit_code is not None else 0 + proc.save(update_fields=["status", "ended_at", "exit_code"]) + cleaned += 1 + + if cleaned: + print(f"[yellow]🧹 Cleaned up {cleaned} orphaned worker/hook process record(s)[/yellow]") + return cleaned + + +# ============================================================================= +# Binary State Machine +# ============================================================================= + + +class BinaryMachine(BaseStateMachine): + """ + State machine for managing Binary installation lifecycle. + + Simple 2-state machine: + ┌─────────────────────────────────────────────────────────────┐ + │ QUEUED State │ + │ â€ĸ Binary needs to be installed │ + └─────────────────────────────────────────────────────────────┘ + ↓ tick() when can_install() + ↓ Synchronous installation during transition + ┌─────────────────────────────────────────────────────────────┐ + │ INSTALLED State │ + │ â€ĸ Binary installed (abspath, version, sha256 set) │ + │ â€ĸ Health stats incremented │ + └─────────────────────────────────────────────────────────────┘ + + If installation fails, Binary stays in QUEUED with retry_at bumped. 
+ """ + + model_attr_name = "binary" + binary: Binary + + # States + queued = State(value=Binary.StatusChoices.QUEUED, initial=True) + installed = State(value=Binary.StatusChoices.INSTALLED, final=True) + + # Tick Event - install happens during transition + tick = queued.to.itself(unless="can_install") | queued.to(installed, cond="can_install", on="on_install") + + def can_install(self) -> bool: + """Check if binary installation can start.""" + return bool(self.binary.name and self.binary.binproviders) + + @queued.enter + def enter_queued(self): + """Binary is queued for installation.""" + self.binary.update_and_requeue( + retry_at=timezone.now(), + status=Binary.StatusChoices.QUEUED, + ) + + def on_install(self): + """Called during queued→installed transition. Runs installation synchronously.""" + import sys + + print(f"[cyan] 🔄 BinaryMachine.on_install() - installing {self.binary.name}[/cyan]", file=sys.stderr) + + # Run installation hooks (synchronous, updates abspath/version/sha256 and sets status) + self.binary.run() + + # Check if installation succeeded by looking at updated status + # Note: Binary.run() updates self.binary.status internally but doesn't refresh our reference + self.binary.refresh_from_db() + + if self.binary.status != Binary.StatusChoices.INSTALLED: + # Installation failed - abort transition, stay in queued + print(f"[red] ❌ BinaryMachine - {self.binary.name} installation failed, retrying later[/red]", file=sys.stderr) + + # Bump retry_at to try again later + self.binary.update_and_requeue( + retry_at=timezone.now() + timedelta(seconds=300), # Retry in 5 minutes + status=Binary.StatusChoices.QUEUED, # Ensure we stay queued + ) + + # Increment health stats for failure + self.binary.increment_health_stats(success=False) + + # Abort the transition - this will raise an exception and keep us in queued + raise Exception(f"Binary {self.binary.name} installation failed") + + print(f"[cyan] ✅ BinaryMachine - {self.binary.name} installed 
successfully[/cyan]", file=sys.stderr) + + @installed.enter + def enter_installed(self): + """Binary installed successfully.""" + self.binary.update_and_requeue( + retry_at=None, + status=Binary.StatusChoices.INSTALLED, + ) + + # Increment health stats + self.binary.increment_health_stats(success=True) + + +# ============================================================================= +# Process State Machine +# ============================================================================= + + +class ProcessMachine(BaseStateMachine): + """ + State machine for managing Process (OS subprocess) lifecycle. + + Process Lifecycle: + ┌─────────────────────────────────────────────────────────────┐ + │ QUEUED State │ + │ â€ĸ Process ready to launch, waiting for resources │ + └─────────────────────────────────────────────────────────────┘ + ↓ tick() when can_start() + ┌─────────────────────────────────────────────────────────────┐ + │ RUNNING State → enter_running() │ + │ 1. process.launch() │ + │ â€ĸ Spawn subprocess with cmd, pwd, env, timeout │ + │ â€ĸ Set pid, started_at │ + │ â€ĸ Process runs in background or foreground │ + │ 2. Monitor process completion │ + │ â€ĸ Check exit code when process completes │ + └─────────────────────────────────────────────────────────────┘ + ↓ tick() checks is_exited() + ┌─────────────────────────────────────────────────────────────┐ + │ EXITED State │ + │ â€ĸ Process completed (exit_code set) │ + │ â€ĸ Health stats incremented │ + │ â€ĸ stdout/stderr captured │ + └─────────────────────────────────────────────────────────────┘ + + Note: This is a simpler state machine than ArchiveResult. + Process is just about execution lifecycle. ArchiveResult handles + the archival-specific logic (status, output parsing, etc.). 
+ """ + + model_attr_name = "process" + process: Process + + # States + queued = State(value=Process.StatusChoices.QUEUED, initial=True) + running = State(value=Process.StatusChoices.RUNNING) + exited = State(value=Process.StatusChoices.EXITED, final=True) + + # Tick Event - transitions based on conditions + tick = ( + queued.to.itself(unless="can_start") + | queued.to(running, cond="can_start") + | running.to.itself(unless="is_exited") + | running.to(exited, cond="is_exited") + ) + + # Additional events (for explicit control) + launch = queued.to(running) + kill = running.to(exited) + + def can_start(self) -> bool: + """Check if process can start (has cmd and machine).""" + return bool(self.process.cmd and self.process.machine) + + def is_exited(self) -> bool: + """Check if process has exited (exit_code is set).""" + return self.process.exit_code is not None + + @queued.enter + def enter_queued(self): + """Process is queued for execution.""" + self.process.update_and_requeue( + retry_at=timezone.now(), + status=Process.StatusChoices.QUEUED, + ) + + @running.enter + def enter_running(self): + """Start process execution.""" + # Lock the process while it runs + self.process.update_and_requeue( + retry_at=timezone.now() + timedelta(seconds=self.process.timeout), + status=Process.StatusChoices.RUNNING, + started_at=timezone.now(), + ) + + # Launch the subprocess + # NOTE: This is a placeholder - actual launch logic would + # be implemented based on how hooks currently spawn processes + # For now, Process is a data model that tracks execution metadata + # The actual subprocess spawning is still handled by run_hook() + + # Mark as immediately exited for now (until we refactor run_hook) + # In the future, this would actually spawn the subprocess + self.process.exit_code = 0 # Placeholder + self.process.save() + + @exited.enter + def enter_exited(self): + """Process has exited.""" + self.process.update_and_requeue( + retry_at=None, + status=Process.StatusChoices.EXITED, + 
ended_at=timezone.now(), + ) + + +# ============================================================================= +# State Machine Registration +# ============================================================================= + +# Manually register state machines with python-statemachine registry +registry.register(BinaryMachine) +registry.register(ProcessMachine) diff --git a/archivebox/machine/tests/__init__.py b/archivebox/machine/tests/__init__.py new file mode 100644 index 0000000000..d7ce160be3 --- /dev/null +++ b/archivebox/machine/tests/__init__.py @@ -0,0 +1 @@ +"""Tests for the machine module (Machine, NetworkInterface, Binary, Process models).""" diff --git a/archivebox/main.py b/archivebox/main.py deleted file mode 100644 index 0107bac0d4..0000000000 --- a/archivebox/main.py +++ /dev/null @@ -1,1306 +0,0 @@ -__package__ = 'archivebox' - -import os -import sys -import shutil -import platform -from pathlib import Path -from datetime import date - -from typing import Dict, List, Optional, Iterable, IO, Union -from crontab import CronTab, CronSlices -from django.db.models import QuerySet - -from .cli import ( - list_subcommands, - run_subcommand, - display_first, - meta_cmds, - main_cmds, - archive_cmds, -) -from .parsers import ( - save_text_as_source, - save_file_as_source, - parse_links_memory, -) -from .index.schema import Link -from .util import enforce_types # type: ignore -from .system import get_dir_size, dedupe_cron_jobs, CRON_COMMENT -from .system import run as run_shell -from .index import ( - load_main_index, - parse_links_from_source, - dedupe_links, - write_main_index, - snapshot_filter, - get_indexed_folders, - get_archived_folders, - get_unarchived_folders, - get_present_folders, - get_valid_folders, - get_invalid_folders, - get_duplicate_folders, - get_orphaned_folders, - get_corrupted_folders, - get_unrecognized_folders, - fix_invalid_folder_locations, - write_link_details, -) -from .index.json import ( - parse_json_main_index, - 
parse_json_links_details, - generate_json_index_from_links, -) -from .index.sql import ( - get_admins, - apply_migrations, - remove_from_sql_main_index, -) -from .index.html import ( - generate_index_from_links, -) -from .index.csv import links_to_csv -from .extractors import archive_links, archive_link, ignore_methods -from .config import ( - stderr, - hint, - ConfigDict, - ANSI, - IS_TTY, - DEBUG, - IN_DOCKER, - USER, - PYTHON_BINARY, - ARCHIVEBOX_BINARY, - ONLY_NEW, - OUTPUT_DIR, - SOURCES_DIR, - ARCHIVE_DIR, - LOGS_DIR, - PACKAGE_DIR, - CONFIG_FILE, - ARCHIVE_DIR_NAME, - JSON_INDEX_FILENAME, - HTML_INDEX_FILENAME, - SQL_INDEX_FILENAME, - ALLOWED_IN_OUTPUT_DIR, - SEARCH_BACKEND_ENGINE, - check_dependencies, - check_data_folder, - write_config_file, - VERSION, - CODE_LOCATIONS, - EXTERNAL_LOCATIONS, - DATA_LOCATIONS, - DEPENDENCIES, - USE_CHROME, - CHROME_BINARY, - CHROME_VERSION, - YOUTUBEDL_BINARY, - YOUTUBEDL_VERSION, - SINGLEFILE_VERSION, - READABILITY_VERSION, - MERCURY_VERSION, - USE_YOUTUBEDL, - USE_NODE, - NODE_VERSION, - load_all_config, - CONFIG, - USER_CONFIG, - get_real_name, - setup_django, -) -from .logging_util import ( - TERM_WIDTH, - TimedProgress, - log_importing_started, - log_crawl_started, - log_removal_started, - log_removal_finished, - log_list_started, - log_list_finished, - printable_config, - printable_folders, - printable_filesize, - printable_folder_status, - printable_dependency_version, -) - -from .search import flush_search_index, index_links - - - -@enforce_types -def help(out_dir: Path=OUTPUT_DIR) -> None: - """Print the ArchiveBox help message and usage""" - - all_subcommands = list_subcommands() - COMMANDS_HELP_TEXT = '\n '.join( - f'{cmd.ljust(20)} {summary}' - for cmd, summary in all_subcommands.items() - if cmd in meta_cmds - ) + '\n\n ' + '\n '.join( - f'{cmd.ljust(20)} {summary}' - for cmd, summary in all_subcommands.items() - if cmd in main_cmds - ) + '\n\n ' + '\n '.join( - f'{cmd.ljust(20)} {summary}' - for cmd, summary 
in all_subcommands.items() - if cmd in archive_cmds - ) + '\n\n ' + '\n '.join( - f'{cmd.ljust(20)} {summary}' - for cmd, summary in all_subcommands.items() - if cmd not in display_first - ) - - - if (Path(out_dir) / SQL_INDEX_FILENAME).exists(): - print('''{green}ArchiveBox v{}: The self-hosted internet archive.{reset} - -{lightred}Active data directory:{reset} - {} - -{lightred}Usage:{reset} - archivebox [command] [--help] [--version] [...args] - -{lightred}Commands:{reset} - {} - -{lightred}Example Use:{reset} - mkdir my-archive; cd my-archive/ - archivebox init - archivebox status - - archivebox add https://example.com/some/page - archivebox add --depth=1 ~/Downloads/bookmarks_export.html - - archivebox list --sort=timestamp --csv=timestamp,url,is_archived - archivebox schedule --every=day https://example.com/some/feed.rss - archivebox update --resume=15109948213.123 - -{lightred}Documentation:{reset} - https://github.com/ArchiveBox/ArchiveBox/wiki -'''.format(VERSION, out_dir, COMMANDS_HELP_TEXT, **ANSI)) - - else: - print('{green}Welcome to ArchiveBox v{}!{reset}'.format(VERSION, **ANSI)) - print() - if IN_DOCKER: - print('When using Docker, you need to mount a volume to use as your data dir:') - print(' docker run -v /some/path:/data archivebox ...') - print() - print('To import an existing archive (from a previous version of ArchiveBox):') - print(' 1. cd into your data dir OUTPUT_DIR (usually ArchiveBox/output) and run:') - print(' 2. archivebox init') - print() - print('To start a new archive:') - print(' 1. Create an empty directory, then cd into it and run:') - print(' 2. 
archivebox init') - print() - print('For more information, see the documentation here:') - print(' https://github.com/ArchiveBox/ArchiveBox/wiki') - - -@enforce_types -def version(quiet: bool=False, - out_dir: Path=OUTPUT_DIR) -> None: - """Print the ArchiveBox version and dependency information""" - - if quiet: - print(VERSION) - else: - # ArchiveBox v0.5.6 - # Cpython Linux Linux-4.19.121-linuxkit-x86_64-with-glibc2.28 x86_64 (in Docker) (in TTY) - print('ArchiveBox v{}'.format(VERSION)) - p = platform.uname() - print( - sys.implementation.name.title(), - p.system, - platform.platform(), - p.machine, - ) - print( - f'IN_DOCKER={IN_DOCKER}', - f'DEBUG={DEBUG}', - f'IS_TTY={IS_TTY}', - f'TZ={os.environ.get("TZ", "UTC")}', - f'SEARCH_BACKEND_ENGINE={SEARCH_BACKEND_ENGINE}', - ) - print() - - print('{white}[i] Dependency versions:{reset}'.format(**ANSI)) - for name, dependency in DEPENDENCIES.items(): - print(printable_dependency_version(name, dependency)) - - print() - print('{white}[i] Source-code locations:{reset}'.format(**ANSI)) - for name, folder in CODE_LOCATIONS.items(): - print(printable_folder_status(name, folder)) - - print() - print('{white}[i] Secrets locations:{reset}'.format(**ANSI)) - for name, folder in EXTERNAL_LOCATIONS.items(): - print(printable_folder_status(name, folder)) - - print() - if DATA_LOCATIONS['OUTPUT_DIR']['is_valid']: - print('{white}[i] Data locations:{reset}'.format(**ANSI)) - for name, folder in DATA_LOCATIONS.items(): - print(printable_folder_status(name, folder)) - else: - print() - print('{white}[i] Data locations:{reset}'.format(**ANSI)) - - print() - check_dependencies() - - -@enforce_types -def run(subcommand: str, - subcommand_args: Optional[List[str]], - stdin: Optional[IO]=None, - out_dir: Path=OUTPUT_DIR) -> None: - """Run a given ArchiveBox subcommand with the given list of args""" - run_subcommand( - subcommand=subcommand, - subcommand_args=subcommand_args, - stdin=stdin, - pwd=out_dir, - ) - - -@enforce_types -def 
init(force: bool=False, quick: bool=False, setup: bool=False, out_dir: Path=OUTPUT_DIR) -> None: - """Initialize a new ArchiveBox collection in the current directory""" - - from core.models import Snapshot - - out_dir.mkdir(exist_ok=True) - is_empty = not len(set(os.listdir(out_dir)) - ALLOWED_IN_OUTPUT_DIR) - - if (out_dir / JSON_INDEX_FILENAME).exists(): - stderr("[!] This folder contains a JSON index. It is deprecated, and will no longer be kept up to date automatically.", color="lightyellow") - stderr(" You can run `archivebox list --json --with-headers > static_index.json` to manually generate it.", color="lightyellow") - - existing_index = (out_dir / SQL_INDEX_FILENAME).exists() - - if is_empty and not existing_index: - print('{green}[+] Initializing a new ArchiveBox v{} collection...{reset}'.format(VERSION, **ANSI)) - print('{green}----------------------------------------------------------------------{reset}'.format(**ANSI)) - elif existing_index: - # TODO: properly detect and print the existing version in current index as well - print('{green}[^] Verifying and updating existing ArchiveBox collection to v{}...{reset}'.format(VERSION, **ANSI)) - print('{green}----------------------------------------------------------------------{reset}'.format(**ANSI)) - else: - if force: - stderr('[!] 
This folder appears to already have files in it, but no index.sqlite3 is present.', color='lightyellow') - stderr(' Because --force was passed, ArchiveBox will initialize anyway (which may overwrite existing files).') - else: - stderr( - ("{red}[X] This folder appears to already have files in it, but no index.sqlite3 present.{reset}\n\n" - " You must run init in a completely empty directory, or an existing data folder.\n\n" - " {lightred}Hint:{reset} To import an existing data folder make sure to cd into the folder first, \n" - " then run and run 'archivebox init' to pick up where you left off.\n\n" - " (Always make sure your data folder is backed up first before updating ArchiveBox)" - ).format(out_dir, **ANSI) - ) - raise SystemExit(2) - - if existing_index: - print('\n{green}[*] Verifying archive folder structure...{reset}'.format(**ANSI)) - else: - print('\n{green}[+] Building archive folder structure...{reset}'.format(**ANSI)) - - print(f' + ./{ARCHIVE_DIR.relative_to(OUTPUT_DIR)}, ./{SOURCES_DIR.relative_to(OUTPUT_DIR)}, ./{LOGS_DIR.relative_to(OUTPUT_DIR)}...') - Path(SOURCES_DIR).mkdir(exist_ok=True) - Path(ARCHIVE_DIR).mkdir(exist_ok=True) - Path(LOGS_DIR).mkdir(exist_ok=True) - print(f' + ./{CONFIG_FILE.relative_to(OUTPUT_DIR)}...') - write_config_file({}, out_dir=out_dir) - - if (out_dir / SQL_INDEX_FILENAME).exists(): - print('\n{green}[*] Verifying main SQL index and running any migrations needed...{reset}'.format(**ANSI)) - else: - print('\n{green}[+] Building main SQL index and running initial migrations...{reset}'.format(**ANSI)) - - DATABASE_FILE = out_dir / SQL_INDEX_FILENAME - for migration_line in apply_migrations(out_dir): - print(f' {migration_line}') - - assert DATABASE_FILE.exists() - print() - print(f' √ ./{DATABASE_FILE.relative_to(OUTPUT_DIR)}') - - # from django.contrib.auth.models import User - # if IS_TTY and not User.objects.filter(is_superuser=True).exists(): - # print('{green}[+] Creating admin user 
account...{reset}'.format(**ANSI)) - # call_command("createsuperuser", interactive=True) - - print() - print('{green}[*] Checking links from indexes and archive folders (safe to Ctrl+C)...{reset}'.format(**ANSI)) - - all_links = Snapshot.objects.none() - pending_links: Dict[str, Link] = {} - - if existing_index: - all_links = load_main_index(out_dir=out_dir, warn=False) - print(' √ Loaded {} links from existing main index.'.format(all_links.count())) - - if quick: - print(' > Skipping full snapshot directory check (quick mode)') - else: - try: - # Links in data folders that dont match their timestamp - fixed, cant_fix = fix_invalid_folder_locations(out_dir=out_dir) - if fixed: - print(' {lightyellow}√ Fixed {} data directory locations that didn\'t match their link timestamps.{reset}'.format(len(fixed), **ANSI)) - if cant_fix: - print(' {lightyellow}! Could not fix {} data directory locations due to conflicts with existing folders.{reset}'.format(len(cant_fix), **ANSI)) - - # Links in JSON index but not in main index - orphaned_json_links = { - link.url: link - for link in parse_json_main_index(out_dir) - if not all_links.filter(url=link.url).exists() - } - if orphaned_json_links: - pending_links.update(orphaned_json_links) - print(' {lightyellow}√ Added {} orphaned links from existing JSON index...{reset}'.format(len(orphaned_json_links), **ANSI)) - - # Links in data dir indexes but not in main index - orphaned_data_dir_links = { - link.url: link - for link in parse_json_links_details(out_dir) - if not all_links.filter(url=link.url).exists() - } - if orphaned_data_dir_links: - pending_links.update(orphaned_data_dir_links) - print(' {lightyellow}√ Added {} orphaned links from existing archive directories.{reset}'.format(len(orphaned_data_dir_links), **ANSI)) - - # Links in invalid/duplicate data dirs - invalid_folders = { - folder: link - for folder, link in get_invalid_folders(all_links, out_dir=out_dir).items() - } - if invalid_folders: - print(' {lightyellow}! 
Skipped adding {} invalid link data directories.{reset}'.format(len(invalid_folders), **ANSI)) - print(' X ' + '\n X '.join(f'./{Path(folder).relative_to(OUTPUT_DIR)} {link}' for folder, link in invalid_folders.items())) - print() - print(' {lightred}Hint:{reset} For more information about the link data directories that were skipped, run:'.format(**ANSI)) - print(' archivebox status') - print(' archivebox list --status=invalid') - - except (KeyboardInterrupt, SystemExit): - stderr() - stderr('[x] Stopped checking archive directories due to Ctrl-C/SIGTERM', color='red') - stderr(' Your archive data is safe, but you should re-run `archivebox init` to finish the process later.') - stderr() - stderr(' {lightred}Hint:{reset} In the future you can run a quick init without checking dirs like so:'.format(**ANSI)) - stderr(' archivebox init --quick') - raise SystemExit(1) - - write_main_index(list(pending_links.values()), out_dir=out_dir) - - print('\n{green}----------------------------------------------------------------------{reset}'.format(**ANSI)) - if existing_index: - print('{green}[√] Done. Verified and updated the existing ArchiveBox collection.{reset}'.format(**ANSI)) - else: - print('{green}[√] Done. 
A new ArchiveBox collection was initialized ({} links).{reset}'.format(len(all_links) + len(pending_links), **ANSI)) - - json_index = out_dir / JSON_INDEX_FILENAME - html_index = out_dir / HTML_INDEX_FILENAME - index_name = f"{date.today()}_index_old" - if json_index.exists(): - json_index.rename(f"{index_name}.json") - if html_index.exists(): - html_index.rename(f"{index_name}.html") - - if setup: - run_subcommand('setup', pwd=out_dir) - - if Snapshot.objects.count() < 25: # hide the hints for experienced users - print() - print(' {lightred}Hint:{reset} To view your archive index, run:'.format(**ANSI)) - print(' archivebox server # then visit http://127.0.0.1:8000') - print() - print(' To add new links, you can run:') - print(" archivebox add ~/some/path/or/url/to/list_of_links.txt") - print() - print(' For more usage and examples, run:') - print(' archivebox help') - -@enforce_types -def status(out_dir: Path=OUTPUT_DIR) -> None: - """Print out some info and statistics about the archive collection""" - - check_data_folder(out_dir=out_dir) - - from core.models import Snapshot - from django.contrib.auth import get_user_model - User = get_user_model() - - print('{green}[*] Scanning archive main index...{reset}'.format(**ANSI)) - print(ANSI['lightyellow'], f' {out_dir}/*', ANSI['reset']) - num_bytes, num_dirs, num_files = get_dir_size(out_dir, recursive=False, pattern='index.') - size = printable_filesize(num_bytes) - print(f' Index size: {size} across {num_files} files') - print() - - links = load_main_index(out_dir=out_dir) - num_sql_links = links.count() - num_link_details = sum(1 for link in parse_json_links_details(out_dir=out_dir)) - print(f' > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {SQL_INDEX_FILENAME})') - print(f' > JSON Link Details: {num_link_details} links'.ljust(36), f'(found in {ARCHIVE_DIR_NAME}/*/index.json)') - print() - print('{green}[*] Scanning archive data directories...{reset}'.format(**ANSI)) - print(ANSI['lightyellow'], 
f' {ARCHIVE_DIR}/*', ANSI['reset']) - num_bytes, num_dirs, num_files = get_dir_size(ARCHIVE_DIR) - size = printable_filesize(num_bytes) - print(f' Size: {size} across {num_files} files in {num_dirs} directories') - print(ANSI['black']) - num_indexed = len(get_indexed_folders(links, out_dir=out_dir)) - num_archived = len(get_archived_folders(links, out_dir=out_dir)) - num_unarchived = len(get_unarchived_folders(links, out_dir=out_dir)) - print(f' > indexed: {num_indexed}'.ljust(36), f'({get_indexed_folders.__doc__})') - print(f' > archived: {num_archived}'.ljust(36), f'({get_archived_folders.__doc__})') - print(f' > unarchived: {num_unarchived}'.ljust(36), f'({get_unarchived_folders.__doc__})') - - num_present = len(get_present_folders(links, out_dir=out_dir)) - num_valid = len(get_valid_folders(links, out_dir=out_dir)) - print() - print(f' > present: {num_present}'.ljust(36), f'({get_present_folders.__doc__})') - print(f' > valid: {num_valid}'.ljust(36), f'({get_valid_folders.__doc__})') - - duplicate = get_duplicate_folders(links, out_dir=out_dir) - orphaned = get_orphaned_folders(links, out_dir=out_dir) - corrupted = get_corrupted_folders(links, out_dir=out_dir) - unrecognized = get_unrecognized_folders(links, out_dir=out_dir) - num_invalid = len({**duplicate, **orphaned, **corrupted, **unrecognized}) - print(f' > invalid: {num_invalid}'.ljust(36), f'({get_invalid_folders.__doc__})') - print(f' > duplicate: {len(duplicate)}'.ljust(36), f'({get_duplicate_folders.__doc__})') - print(f' > orphaned: {len(orphaned)}'.ljust(36), f'({get_orphaned_folders.__doc__})') - print(f' > corrupted: {len(corrupted)}'.ljust(36), f'({get_corrupted_folders.__doc__})') - print(f' > unrecognized: {len(unrecognized)}'.ljust(36), f'({get_unrecognized_folders.__doc__})') - - print(ANSI['reset']) - - if num_indexed: - print(' {lightred}Hint:{reset} You can list link data directories by status like so:'.format(**ANSI)) - print(' archivebox list --status=<status> (e.g. 
indexed, corrupted, archived, etc.)') - - if orphaned: - print(' {lightred}Hint:{reset} To automatically import orphaned data directories into the main index, run:'.format(**ANSI)) - print(' archivebox init') - - if num_invalid: - print(' {lightred}Hint:{reset} You may need to manually remove or fix some invalid data directories, afterwards make sure to run:'.format(**ANSI)) - print(' archivebox init') - - print() - print('{green}[*] Scanning recent archive changes and user logins:{reset}'.format(**ANSI)) - print(ANSI['lightyellow'], f' {LOGS_DIR}/*', ANSI['reset']) - users = get_admins().values_list('username', flat=True) - print(f' UI users {len(users)}: {", ".join(users)}') - last_login = User.objects.order_by('last_login').last() - if last_login: - print(f' Last UI login: {last_login.username} @ {str(last_login.last_login)[:16]}') - last_updated = Snapshot.objects.order_by('updated').last() - if last_updated: - print(f' Last changes: {str(last_updated.updated)[:16]}') - - if not users: - print() - print(' {lightred}Hint:{reset} You can create an admin user by running:'.format(**ANSI)) - print(' archivebox manage createsuperuser') - - print() - for snapshot in links.order_by('-updated')[:10]: - if not snapshot.updated: - continue - print( - ANSI['black'], - ( - f' > {str(snapshot.updated)[:16]} ' - f'[{snapshot.num_outputs} {("X", "√")[snapshot.is_archived]} {printable_filesize(snapshot.archive_size)}] ' - f'"{snapshot.title}": {snapshot.url}' - )[:TERM_WIDTH()], - ANSI['reset'], - ) - print(ANSI['black'], ' ...', ANSI['reset']) - - -@enforce_types -def oneshot(url: str, extractors: str="", out_dir: Path=OUTPUT_DIR): - """ - Create a single URL archive folder with an index.json and index.html, and all the archive method outputs. - You can run this to archive single pages without needing to create a whole collection with archivebox init. 
- """ - oneshot_link, _ = parse_links_memory([url]) - if len(oneshot_link) > 1: - stderr( - '[X] You should pass a single url to the oneshot command', - color='red' - ) - raise SystemExit(2) - - methods = extractors.split(",") if extractors else ignore_methods(['title']) - archive_link(oneshot_link[0], out_dir=out_dir, methods=methods) - return oneshot_link - -@enforce_types -def add(urls: Union[str, List[str]], - tag: str='', - depth: int=0, - update_all: bool=not ONLY_NEW, - index_only: bool=False, - overwrite: bool=False, - # duplicate: bool=False, # TODO: reuse the logic from admin.py resnapshot to allow adding multiple snapshots by appending timestamp automatically - init: bool=False, - extractors: str="", - parser: str="auto", - out_dir: Path=OUTPUT_DIR) -> List[Link]: - """Add a new URL or list of URLs to your archive""" - - from core.models import Tag - - assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)' - - extractors = extractors.split(",") if extractors else [] - - if init: - run_subcommand('init', stdin=None, pwd=out_dir) - - # Load list of links from the existing index - check_data_folder(out_dir=out_dir) - check_dependencies() - new_links: List[Link] = [] - all_links = load_main_index(out_dir=out_dir) - - log_importing_started(urls=urls, depth=depth, index_only=index_only) - if isinstance(urls, str): - # save verbatim stdin to sources - write_ahead_log = save_text_as_source(urls, filename='{ts}-import.txt', out_dir=out_dir) - elif isinstance(urls, list): - # save verbatim args to sources - write_ahead_log = save_text_as_source('\n'.join(urls), filename='{ts}-import.txt', out_dir=out_dir) - - new_links += parse_links_from_source(write_ahead_log, root_url=None, parser=parser) - - # If we're going one level deeper, download each link and look for more links - new_links_depth = [] - if new_links and depth == 1: - log_crawl_started(new_links) - for new_link in new_links: - downloaded_file = save_file_as_source(new_link.url, 
filename=f'{new_link.timestamp}-crawl-{new_link.domain}.txt', out_dir=out_dir) - new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url) - - imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values()) - - new_links = dedupe_links(all_links, imported_links) - - write_main_index(links=new_links, out_dir=out_dir) - all_links = load_main_index(out_dir=out_dir) - - if index_only: - # mock archive all the links using the fake index_only extractor method in order to update their state - if overwrite: - archive_links(imported_links, overwrite=overwrite, methods=['index_only'], out_dir=out_dir) - else: - archive_links(new_links, overwrite=False, methods=['index_only'], out_dir=out_dir) - else: - # fully run the archive extractor methods for each link - archive_kwargs = { - "out_dir": out_dir, - } - if extractors: - archive_kwargs["methods"] = extractors - - if update_all: - archive_links(all_links, overwrite=overwrite, **archive_kwargs) - elif overwrite: - archive_links(imported_links, overwrite=True, **archive_kwargs) - elif new_links: - archive_links(new_links, overwrite=False, **archive_kwargs) - - - # add any tags to imported links - tags = [ - Tag.objects.get_or_create(name=name.strip())[0] - for name in tag.split(',') - if name.strip() - ] - if tags: - for link in imported_links: - snapshot = link.as_snapshot() - snapshot.tags.add(*tags) - snapshot.tags_str(nocache=True) - snapshot.save() - # print(f' √ Tagged {len(imported_links)} Snapshots with {len(tags)} tags {tags_str}') - - - return all_links - -@enforce_types -def remove(filter_str: Optional[str]=None, - filter_patterns: Optional[List[str]]=None, - filter_type: str='exact', - snapshots: Optional[QuerySet]=None, - after: Optional[float]=None, - before: Optional[float]=None, - yes: bool=False, - delete: bool=False, - out_dir: Path=OUTPUT_DIR) -> List[Link]: - """Remove the specified URLs from the archive""" - - check_data_folder(out_dir=out_dir) - - if 
snapshots is None: - if filter_str and filter_patterns: - stderr( - '[X] You should pass either a pattern as an argument, ' - 'or pass a list of patterns via stdin, but not both.\n', - color='red', - ) - raise SystemExit(2) - elif not (filter_str or filter_patterns): - stderr( - '[X] You should pass either a pattern as an argument, ' - 'or pass a list of patterns via stdin.', - color='red', - ) - stderr() - hint(('To remove all urls you can run:', - 'archivebox remove --filter-type=regex ".*"')) - stderr() - raise SystemExit(2) - elif filter_str: - filter_patterns = [ptn.strip() for ptn in filter_str.split('\n')] - - list_kwargs = { - "filter_patterns": filter_patterns, - "filter_type": filter_type, - "after": after, - "before": before, - } - if snapshots: - list_kwargs["snapshots"] = snapshots - - log_list_started(filter_patterns, filter_type) - timer = TimedProgress(360, prefix=' ') - try: - snapshots = list_links(**list_kwargs) - finally: - timer.end() - - - if not snapshots.exists(): - log_removal_finished(0, 0) - raise SystemExit(1) - - - log_links = [link.as_link() for link in snapshots] - log_list_finished(log_links) - log_removal_started(log_links, yes=yes, delete=delete) - - timer = TimedProgress(360, prefix=' ') - try: - for snapshot in snapshots: - if delete: - shutil.rmtree(snapshot.as_link().link_dir, ignore_errors=True) - finally: - timer.end() - - to_remove = snapshots.count() - - flush_search_index(snapshots=snapshots) - remove_from_sql_main_index(snapshots=snapshots, out_dir=out_dir) - all_snapshots = load_main_index(out_dir=out_dir) - log_removal_finished(all_snapshots.count(), to_remove) - - return all_snapshots - -@enforce_types -def update(resume: Optional[float]=None, - only_new: bool=ONLY_NEW, - index_only: bool=False, - overwrite: bool=False, - filter_patterns_str: Optional[str]=None, - filter_patterns: Optional[List[str]]=None, - filter_type: Optional[str]=None, - status: Optional[str]=None, - after: Optional[str]=None, - before: 
Optional[str]=None, - extractors: str="", - out_dir: Path=OUTPUT_DIR) -> List[Link]: - """Import any new links from subscriptions and retry any previously failed/skipped links""" - - check_data_folder(out_dir=out_dir) - check_dependencies() - new_links: List[Link] = [] # TODO: Remove input argument: only_new - - extractors = extractors.split(",") if extractors else [] - - # Step 1: Filter for selected_links - matching_snapshots = list_links( - filter_patterns=filter_patterns, - filter_type=filter_type, - before=before, - after=after, - ) - - matching_folders = list_folders( - links=matching_snapshots, - status=status, - out_dir=out_dir, - ) - all_links = [link for link in matching_folders.values() if link] - - if index_only: - for link in all_links: - write_link_details(link, out_dir=out_dir, skip_sql_index=True) - index_links(all_links, out_dir=out_dir) - return all_links - - # Step 2: Run the archive methods for each link - to_archive = new_links if only_new else all_links - if resume: - to_archive = [ - link for link in to_archive - if link.timestamp >= str(resume) - ] - if not to_archive: - stderr('') - stderr(f'[√] Nothing found to resume after {resume}', color='green') - return all_links - - archive_kwargs = { - "out_dir": out_dir, - } - if extractors: - archive_kwargs["methods"] = extractors - - archive_links(to_archive, overwrite=overwrite, **archive_kwargs) - - # Step 4: Re-write links index with updated titles, icons, and resources - all_links = load_main_index(out_dir=out_dir) - return all_links - -@enforce_types -def list_all(filter_patterns_str: Optional[str]=None, - filter_patterns: Optional[List[str]]=None, - filter_type: str='exact', - status: Optional[str]=None, - after: Optional[float]=None, - before: Optional[float]=None, - sort: Optional[str]=None, - csv: Optional[str]=None, - json: bool=False, - html: bool=False, - with_headers: bool=False, - out_dir: Path=OUTPUT_DIR) -> Iterable[Link]: - """List, filter, and export information about archive 
entries""" - - check_data_folder(out_dir=out_dir) - - if filter_patterns and filter_patterns_str: - stderr( - '[X] You should either pass filter patterns as an arguments ' - 'or via stdin, but not both.\n', - color='red', - ) - raise SystemExit(2) - elif filter_patterns_str: - filter_patterns = filter_patterns_str.split('\n') - - snapshots = list_links( - filter_patterns=filter_patterns, - filter_type=filter_type, - before=before, - after=after, - ) - - if sort: - snapshots = snapshots.order_by(sort) - - folders = list_folders( - links=snapshots, - status=status, - out_dir=out_dir, - ) - - if json: - output = generate_json_index_from_links(folders.values(), with_headers) - elif html: - output = generate_index_from_links(folders.values(), with_headers) - elif csv: - output = links_to_csv(folders.values(), cols=csv.split(','), header=with_headers) - else: - output = printable_folders(folders, with_headers=with_headers) - print(output) - return folders - - -@enforce_types -def list_links(snapshots: Optional[QuerySet]=None, - filter_patterns: Optional[List[str]]=None, - filter_type: str='exact', - after: Optional[float]=None, - before: Optional[float]=None, - out_dir: Path=OUTPUT_DIR) -> Iterable[Link]: - - check_data_folder(out_dir=out_dir) - - if snapshots: - all_snapshots = snapshots - else: - all_snapshots = load_main_index(out_dir=out_dir) - - if after is not None: - all_snapshots = all_snapshots.filter(timestamp__gte=after) - if before is not None: - all_snapshots = all_snapshots.filter(timestamp__lt=before) - if filter_patterns: - all_snapshots = snapshot_filter(all_snapshots, filter_patterns, filter_type) - - if not all_snapshots: - stderr('[!] 
No Snapshots matched your filters:', filter_patterns, f'({filter_type})', color='lightyellow') - - return all_snapshots - -@enforce_types -def list_folders(links: List[Link], - status: str, - out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]: - - check_data_folder(out_dir=out_dir) - - STATUS_FUNCTIONS = { - "indexed": get_indexed_folders, - "archived": get_archived_folders, - "unarchived": get_unarchived_folders, - "present": get_present_folders, - "valid": get_valid_folders, - "invalid": get_invalid_folders, - "duplicate": get_duplicate_folders, - "orphaned": get_orphaned_folders, - "corrupted": get_corrupted_folders, - "unrecognized": get_unrecognized_folders, - } - - try: - return STATUS_FUNCTIONS[status](links, out_dir=out_dir) - except KeyError: - raise ValueError('Status not recognized.') - -@enforce_types -def setup(out_dir: Path=OUTPUT_DIR) -> None: - """Automatically install all ArchiveBox dependencies and extras""" - - if not (out_dir / ARCHIVE_DIR_NAME).exists(): - run_subcommand('init', stdin=None, pwd=out_dir) - - setup_django(out_dir=out_dir, check_db=True) - from core.models import User - - if not User.objects.filter(is_superuser=True).exists(): - stderr('\n[+] Creating new admin user for the Web UI...', color='green') - run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir) - - stderr('\n[+] Installing enabled ArchiveBox dependencies automatically...', color='green') - - stderr('\n Installing YOUTUBEDL_BINARY automatically using pip...') - if USE_YOUTUBEDL: - if YOUTUBEDL_VERSION: - print(f'{YOUTUBEDL_VERSION} is already installed', YOUTUBEDL_BINARY) - else: - try: - run_shell([ - PYTHON_BINARY, '-m', 'pip', - 'install', - '--upgrade', - '--no-cache-dir', - '--no-warn-script-location', - 'youtube_dl', - ], capture_output=False, cwd=out_dir) - pkg_path = run_shell([ - PYTHON_BINARY, '-m', 'pip', - 'show', - 'youtube_dl', - ], capture_output=True, text=True, cwd=out_dir).stdout.split('Location: ')[-1].split('\n', 1)[0] - 
NEW_YOUTUBEDL_BINARY = Path(pkg_path) / 'youtube_dl' / '__main__.py' - os.chmod(NEW_YOUTUBEDL_BINARY, 0o777) - assert NEW_YOUTUBEDL_BINARY.exists(), f'youtube_dl must exist inside {pkg_path}' - config(f'YOUTUBEDL_BINARY={NEW_YOUTUBEDL_BINARY}', set=True, out_dir=out_dir) - except BaseException as e: - stderr(f'[X] Failed to install python packages: {e}', color='red') - raise SystemExit(1) - - stderr('\n Installing CHROME_BINARY automatically using playwright...') - if USE_CHROME: - if CHROME_VERSION: - print(f'{CHROME_VERSION} is already installed', CHROME_BINARY) - else: - try: - run_shell([ - PYTHON_BINARY, '-m', 'pip', - 'install', - '--upgrade', - '--no-cache-dir', - '--no-warn-script-location', - 'playwright', - ], capture_output=False, cwd=out_dir) - run_shell([PYTHON_BINARY, '-m', 'playwright', 'install', 'chromium'], capture_output=False, cwd=out_dir) - proc = run_shell([PYTHON_BINARY, '-c', 'from playwright.sync_api import sync_playwright; print(sync_playwright().start().chromium.executable_path)'], capture_output=True, text=True, cwd=out_dir) - NEW_CHROME_BINARY = proc.stdout.decode().strip() if isinstance(proc.stdout, bytes) else proc.stdout.strip() - assert NEW_CHROME_BINARY and len(NEW_CHROME_BINARY), 'CHROME_BINARY must contain a path' - config(f'CHROME_BINARY={NEW_CHROME_BINARY}', set=True, out_dir=out_dir) - except BaseException as e: - stderr(f'[X] Failed to install chromium using playwright: {e.__class__.__name__} {e}', color='red') - raise SystemExit(1) - - stderr('\n Installing SINGLEFILE_BINARY, READABILITY_BINARY, MERCURY_BINARY automatically using npm...') - if USE_NODE: - if not NODE_VERSION: - stderr('[X] You must first install node using your system package manager', color='red') - hint([ - 'curl -sL https://deb.nodesource.com/setup_15.x | sudo -E bash -', - 'or to disable all node-based modules run: archivebox config --set USE_NODE=False', - ]) - raise SystemExit(1) - - if all((SINGLEFILE_VERSION, READABILITY_VERSION, MERCURY_VERSION)): - 
print('SINGLEFILE_BINARY, READABILITY_BINARY, and MERCURURY_BINARY are already installed') - else: - try: - # clear out old npm package locations - paths = ( - out_dir / 'package.json', - out_dir / 'package_lock.json', - out_dir / 'node_modules', - ) - for path in paths: - if path.is_dir(): - shutil.rmtree(path, ignore_errors=True) - elif path.is_file(): - os.remove(path) - - shutil.copyfile(PACKAGE_DIR / 'package.json', out_dir / 'package.json') - run_shell([ - 'npm', - 'install', - '--prefix', str(out_dir), - '--force', - '--no-save', - '--no-audit', - '--no-fund', - '--loglevel', 'error', - ], capture_output=False, cwd=out_dir) - os.remove(out_dir / 'package.json') - except BaseException as e: - stderr(f'[X] Failed to install npm packages: {e}', color='red') - hint(f'Try deleting {out_dir}/node_modules and running it again') - raise SystemExit(1) - - stderr('\n[√] Set up ArchiveBox and its dependencies successfully.', color='green') - - run_shell([ARCHIVEBOX_BINARY, '--version'], capture_output=False, cwd=out_dir) - -@enforce_types -def config(config_options_str: Optional[str]=None, - config_options: Optional[List[str]]=None, - get: bool=False, - set: bool=False, - reset: bool=False, - out_dir: Path=OUTPUT_DIR) -> None: - """Get and set your ArchiveBox project configuration values""" - - check_data_folder(out_dir=out_dir) - - if config_options and config_options_str: - stderr( - '[X] You should either pass config values as an arguments ' - 'or via stdin, but not both.\n', - color='red', - ) - raise SystemExit(2) - elif config_options_str: - config_options = config_options_str.split('\n') - - config_options = config_options or [] - - no_args = not (get or set or reset or config_options) - - matching_config: ConfigDict = {} - if get or no_args: - if config_options: - config_options = [get_real_name(key) for key in config_options] - matching_config = {key: CONFIG[key] for key in config_options if key in CONFIG} - failed_config = [key for key in config_options if 
key not in CONFIG] - if failed_config: - stderr() - stderr('[X] These options failed to get', color='red') - stderr(' {}'.format('\n '.join(config_options))) - raise SystemExit(1) - else: - matching_config = CONFIG - - print(printable_config(matching_config)) - raise SystemExit(not matching_config) - elif set: - new_config = {} - failed_options = [] - for line in config_options: - if line.startswith('#') or not line.strip(): - continue - if '=' not in line: - stderr('[X] Config KEY=VALUE must have an = sign in it', color='red') - stderr(f' {line}') - raise SystemExit(2) - - raw_key, val = line.split('=', 1) - raw_key = raw_key.upper().strip() - key = get_real_name(raw_key) - if key != raw_key: - stderr(f'[i] Note: The config option {raw_key} has been renamed to {key}, please use the new name going forwards.', color='lightyellow') - - if key in CONFIG: - new_config[key] = val.strip() - else: - failed_options.append(line) - - if new_config: - before = CONFIG - matching_config = write_config_file(new_config, out_dir=OUTPUT_DIR) - after = load_all_config() - print(printable_config(matching_config)) - - side_effect_changes: ConfigDict = {} - for key, val in after.items(): - if key in USER_CONFIG and (before[key] != after[key]) and (key not in matching_config): - side_effect_changes[key] = after[key] - - if side_effect_changes: - stderr() - stderr('[i] Note: This change also affected these other options that depended on it:', color='lightyellow') - print(' {}'.format(printable_config(side_effect_changes, prefix=' '))) - if failed_options: - stderr() - stderr('[X] These options failed to set (check for typos):', color='red') - stderr(' {}'.format('\n '.join(failed_options))) - raise SystemExit(1) - elif reset: - stderr('[X] This command is not implemented yet.', color='red') - stderr(' Please manually remove the relevant lines from your config file:') - stderr(f' {CONFIG_FILE}') - raise SystemExit(2) - else: - stderr('[X] You must pass either --get or --set, or no 
arguments to get the whole config.', color='red') - stderr(' archivebox config') - stderr(' archivebox config --get SOME_KEY') - stderr(' archivebox config --set SOME_KEY=SOME_VALUE') - raise SystemExit(2) - - -@enforce_types -def schedule(add: bool=False, - show: bool=False, - clear: bool=False, - foreground: bool=False, - run_all: bool=False, - quiet: bool=False, - every: Optional[str]=None, - depth: int=0, - overwrite: bool=False, - import_path: Optional[str]=None, - out_dir: Path=OUTPUT_DIR): - """Set ArchiveBox to regularly import URLs at specific times using cron""" - - check_data_folder(out_dir=out_dir) - - Path(LOGS_DIR).mkdir(exist_ok=True) - - cron = CronTab(user=True) - cron = dedupe_cron_jobs(cron) - - if clear: - print(cron.remove_all(comment=CRON_COMMENT)) - cron.write() - raise SystemExit(0) - - existing_jobs = list(cron.find_comment(CRON_COMMENT)) - - if every or add: - every = every or 'day' - quoted = lambda s: f'"{s}"' if (s and ' ' in str(s)) else str(s) - cmd = [ - 'cd', - quoted(out_dir), - '&&', - quoted(ARCHIVEBOX_BINARY), - *([ - 'add', - *(['--overwrite'] if overwrite else []), - f'--depth={depth}', - f'"{import_path}"', - ] if import_path else ['update']), - '>>', - quoted(Path(LOGS_DIR) / 'schedule.log'), - '2>&1', - - ] - new_job = cron.new(command=' '.join(cmd), comment=CRON_COMMENT) - - if every in ('minute', 'hour', 'day', 'month', 'year'): - set_every = getattr(new_job.every(), every) - set_every() - elif CronSlices.is_valid(every): - new_job.setall(every) - else: - stderr('{red}[X] Got invalid timeperiod for cron task.{reset}'.format(**ANSI)) - stderr(' It must be one of minute/hour/day/month') - stderr(' or a quoted cron-format schedule like:') - stderr(' archivebox init --every=day --depth=1 https://example.com/some/rss/feed.xml') - stderr(' archivebox init --every="0/5 * * * *" --depth=1 https://example.com/some/rss/feed.xml') - raise SystemExit(1) - - cron = dedupe_cron_jobs(cron) - cron.write() - - total_runs = 
sum(j.frequency_per_year() for j in cron) - existing_jobs = list(cron.find_comment(CRON_COMMENT)) - - print() - print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(USER, len(existing_jobs), **ANSI)) - print('\n'.join(f' > {cmd}' if str(cmd) == str(new_job) else f' {cmd}' for cmd in existing_jobs)) - if total_runs > 60 and not quiet: - stderr() - stderr('{lightyellow}[!] With the current cron config, ArchiveBox is estimated to run >{} times per year.{reset}'.format(total_runs, **ANSI)) - stderr(' Congrats on being an enthusiastic internet archiver! 👌') - stderr() - stderr(' Make sure you have enough storage space available to hold all the data.') - stderr(' Using a compressed/deduped filesystem like ZFS is recommended if you plan on archiving a lot.') - stderr('') - elif show: - if existing_jobs: - print('\n'.join(str(cmd) for cmd in existing_jobs)) - else: - stderr('{red}[X] There are no ArchiveBox cron jobs scheduled for your user ({}).{reset}'.format(USER, **ANSI)) - stderr(' To schedule a new job, run:') - stderr(' archivebox schedule --every=[timeperiod] --depth=1 https://example.com/some/rss/feed.xml') - raise SystemExit(0) - - cron = CronTab(user=True) - cron = dedupe_cron_jobs(cron) - existing_jobs = list(cron.find_comment(CRON_COMMENT)) - - if foreground or run_all: - if not existing_jobs: - stderr('{red}[X] You must schedule some jobs first before running in foreground mode.{reset}'.format(**ANSI)) - stderr(' archivebox schedule --every=hour --depth=1 https://example.com/some/rss/feed.xml') - raise SystemExit(1) - - print('{green}[*] Running {} ArchiveBox jobs in foreground task scheduler...{reset}'.format(len(existing_jobs), **ANSI)) - if run_all: - try: - for job in existing_jobs: - sys.stdout.write(f' > {job.command.split("/archivebox ")[0].split(" && ")[0]}\n') - sys.stdout.write(f' > {job.command.split("/archivebox ")[-1].split(" >> ")[0]}') - sys.stdout.flush() - job.run() - sys.stdout.write(f'\r √ 
{job.command.split("/archivebox ")[-1]}\n') - except KeyboardInterrupt: - print('\n{green}[√] Stopped.{reset}'.format(**ANSI)) - raise SystemExit(1) - - if foreground: - try: - for job in existing_jobs: - print(f' > {job.command.split("/archivebox ")[-1].split(" >> ")[0]}') - for result in cron.run_scheduler(): - print(result) - except KeyboardInterrupt: - print('\n{green}[√] Stopped.{reset}'.format(**ANSI)) - raise SystemExit(1) - - -@enforce_types -def server(runserver_args: Optional[List[str]]=None, - reload: bool=False, - debug: bool=False, - init: bool=False, - quick_init: bool=False, - createsuperuser: bool=False, - out_dir: Path=OUTPUT_DIR) -> None: - """Run the ArchiveBox HTTP server""" - - runserver_args = runserver_args or [] - - if init: - run_subcommand('init', stdin=None, pwd=out_dir) - print() - elif quick_init: - run_subcommand('init', subcommand_args=['--quick'], stdin=None, pwd=out_dir) - print() - - if createsuperuser: - run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir) - print() - - # setup config for django runserver - from . import config - config.SHOW_PROGRESS = False - config.DEBUG = config.DEBUG or debug - - check_data_folder(out_dir=out_dir) - - from django.core.management import call_command - from django.contrib.auth.models import User - - print('{green}[+] Starting ArchiveBox webserver...{reset}'.format(**ANSI)) - print(' > Logging errors to ./logs/errors.log') - if not User.objects.filter(is_superuser=True).exists(): - print('{lightyellow}[!] 
No admin users exist yet, you will not be able to edit links in the UI.{reset}'.format(**ANSI)) - print() - print(' To create an admin user, run:') - print(' archivebox manage createsuperuser') - print() - - # fallback to serving staticfiles insecurely with django when DEBUG=False - if not config.DEBUG: - runserver_args.append('--insecure') # TODO: serve statics w/ nginx instead - - # toggle autoreloading when archivebox code changes (it's on by default) - if not reload: - runserver_args.append('--noreload') - - config.SHOW_PROGRESS = False - config.DEBUG = config.DEBUG or debug - - call_command("runserver", *runserver_args) - - -@enforce_types -def manage(args: Optional[List[str]]=None, out_dir: Path=OUTPUT_DIR) -> None: - """Run an ArchiveBox Django management command""" - - check_data_folder(out_dir=out_dir) - from django.core.management import execute_from_command_line - - if (args and "createsuperuser" in args) and (IN_DOCKER and not IS_TTY): - stderr('[!] Warning: you need to pass -it to use interactive commands in docker', color='lightyellow') - stderr(' docker run -it archivebox manage {}'.format(' '.join(args or ['...'])), color='lightyellow') - stderr() - - execute_from_command_line([f'{ARCHIVEBOX_BINARY} manage', *(args or ['help'])]) - - -@enforce_types -def shell(out_dir: Path=OUTPUT_DIR) -> None: - """Enter an interactive ArchiveBox Django shell""" - - check_data_folder(out_dir=out_dir) - - from django.core.management import call_command - call_command("shell_plus") - diff --git a/archivebox/manage.py b/archivebox/manage.py index 1a9b297569..ee4e8d7b53 100755 --- a/archivebox/manage.py +++ b/archivebox/manage.py @@ -2,28 +2,30 @@ import os import sys -if __name__ == '__main__': +if __name__ == "__main__": # if you're a developer working on archivebox, still prefer the archivebox # versions of ./manage.py commands whenever possible. When that's not possible # (e.g. 
makemigrations), you can comment out this check temporarily - if not ('makemigrations' in sys.argv or 'migrate' in sys.argv): + allowed_commands = ["makemigrations", "migrate", "startapp", "squashmigrations", "generate_stubs", "test"] + + if not any(cmd in sys.argv for cmd in allowed_commands): print("[X] Don't run ./manage.py directly (unless you are a developer running makemigrations):") print() - print(' Hint: Use these archivebox CLI commands instead of the ./manage.py equivalents:') - print(' archivebox init (migrates the databse to latest version)') - print(' archivebox server (runs the Django web server)') - print(' archivebox shell (opens an iPython Django shell with all models imported)') - print(' archivebox manage [cmd] (any other management commands)') + print(" Hint: Use these archivebox CLI commands instead of the ./manage.py equivalents:") + print(" archivebox init (migrates the database to latest version)") + print(" archivebox server (runs the Django web server)") + print(" archivebox shell (opens an iPython Django shell with all models imported)") + print(" archivebox manage [cmd] (any other management commands)") raise SystemExit(2) - os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings') + os.environ.setdefault("DJANGO_SETTINGS_MODULE", "core.settings") try: from django.core.management import execute_from_command_line except ImportError as exc: raise ImportError( "Couldn't import Django. Are you sure it's installed and " "available on your PYTHONPATH environment variable? Did you " - "forget to activate a virtual environment?" + "forget to activate a virtual environment?", ) from exc execute_from_command_line(sys.argv) diff --git a/archivebox/mcp/README.md b/archivebox/mcp/README.md new file mode 100644 index 0000000000..8b0aa42b08 --- /dev/null +++ b/archivebox/mcp/README.md @@ -0,0 +1,138 @@ +# ArchiveBox MCP Server + +Model Context Protocol (MCP) server for ArchiveBox that exposes all CLI commands as tools for AI agents. 
+ +## Overview + +This is a lightweight, stateless MCP server that dynamically introspects ArchiveBox's Click CLI commands and exposes them as MCP tools. It requires **zero manual schema definitions** - everything is auto-generated from the existing CLI metadata. + +## Features + +- ✅ **Auto-discovery**: Dynamically discovers all 19+ ArchiveBox CLI commands +- ✅ **Zero duplication**: Reuses existing Click command definitions, types, and help text +- ✅ **Auto-sync**: Changes to CLI commands automatically reflected in MCP tools +- ✅ **Stateless**: No database models or state management required +- ✅ **Lightweight**: ~200 lines of code + +## Usage + +### Start the MCP Server + +```bash +archivebox mcp +``` + +The server runs in stdio mode, reading JSON-RPC 2.0 requests from stdin and writing responses to stdout. + +### Example Client + +```python +import subprocess +import json + +# Start MCP server +proc = subprocess.Popen( + ['archivebox', 'mcp'], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + text=True +) + +# Send initialize request +request = {"jsonrpc": "2.0", "id": 1, "method": "initialize", "params": {}} +proc.stdin.write(json.dumps(request) + '\n') +proc.stdin.flush() + +# Read response +response = json.loads(proc.stdout.readline()) +print(response) +``` + +### Example Requests + +**Initialize:** +```json +{"jsonrpc":"2.0","id":1,"method":"initialize","params":{}} +``` + +**List all available tools:** +```json +{"jsonrpc":"2.0","id":2,"method":"tools/list","params":{}} +``` + +**Call a tool:** +```json +{ + "jsonrpc":"2.0", + "id":3, + "method":"tools/call", + "params":{ + "name":"version", + "arguments":{"quiet":true} + } +} +``` + +## Supported MCP Methods + +- `initialize` - Handshake and capability negotiation +- `tools/list` - List all available CLI commands as MCP tools +- `tools/call` - Execute a CLI command with arguments + +## Available Tools + +The server exposes all ArchiveBox CLI commands: + +**Meta**: `help`, `version`, `mcp` +**Setup**: 
`init`, `install` +**Archive**: `add`, `remove`, `update`, `search`, `status`, `config` +**Workers**: `orchestrator`, `worker` +**Tasks**: `crawl`, `snapshot`, `extract` +**Server**: `server`, `schedule` +**Utilities**: `shell`, `manage` + +## Architecture + +### Dynamic Introspection + +Instead of manually defining schemas, the server uses Click's introspection API to automatically generate MCP tool definitions: + +```python +# Auto-discover commands +from archivebox.cli import ArchiveBoxGroup +cli_group = ArchiveBoxGroup() +all_commands = cli_group.all_subcommands + +# Auto-generate schemas from Click metadata +for cmd_name in all_commands: + click_cmd = cli_group.get_command(None, cmd_name) + # Extract params, types, help text, etc. + tool_schema = click_command_to_mcp_tool(cmd_name, click_cmd) +``` + +### Tool Execution + +Commands are executed using Click's `CliRunner`: + +```python +from click.testing import CliRunner + +runner = CliRunner() +result = runner.invoke(click_command, args) +``` + +## Files + +- `server.py` (~350 lines) - Core MCP server with Click introspection +- `archivebox/cli/archivebox_mcp.py` (~50 lines) - CLI entry point +- `apps.py`, `__init__.py` - Django app boilerplate + +## MCP Specification + +Implements the [MCP 2025-11-25 specification](https://modelcontextprotocol.io/specification/2025-11-25). + +## Sources + +- [MCP Specification](https://modelcontextprotocol.io/specification/2025-11-25) +- [MCP Introduction](https://www.anthropic.com/news/model-context-protocol) +- [MCP GitHub](https://github.com/modelcontextprotocol/modelcontextprotocol) diff --git a/archivebox/mcp/__init__.py b/archivebox/mcp/__init__.py new file mode 100644 index 0000000000..dd4a67f328 --- /dev/null +++ b/archivebox/mcp/__init__.py @@ -0,0 +1,8 @@ +__package__ = "archivebox.mcp" + +""" +Model Context Protocol (MCP) server for ArchiveBox. + +Exposes all ArchiveBox CLI commands as MCP tools via dynamic Click introspection. 
+Provides a JSON-RPC 2.0 interface over stdio for AI agents to control ArchiveBox. +""" diff --git a/archivebox/mcp/apps.py b/archivebox/mcp/apps.py new file mode 100644 index 0000000000..3413e01b1a --- /dev/null +++ b/archivebox/mcp/apps.py @@ -0,0 +1,9 @@ +__package__ = "archivebox.mcp" + +from django.apps import AppConfig + + +class MCPConfig(AppConfig): + name = "mcp" + verbose_name = "Model Context Protocol Server" + default_auto_field = "django.db.models.BigAutoField" diff --git a/archivebox/mcp/server.py b/archivebox/mcp/server.py new file mode 100644 index 0000000000..26196b7951 --- /dev/null +++ b/archivebox/mcp/server.py @@ -0,0 +1,402 @@ +""" +Model Context Protocol (MCP) server implementation for ArchiveBox. + +Dynamically exposes all ArchiveBox CLI commands as MCP tools by introspecting +Click command metadata. Handles JSON-RPC 2.0 requests over stdio transport. +""" + +import sys +import json +import traceback +from typing import Any + +import click +from click.testing import CliRunner + +from archivebox.config.version import VERSION + + +class MCPJSONEncoder(json.JSONEncoder): + """Custom JSON encoder that handles Click sentinel values and other special types""" + + def default(self, o): + # Handle Click's sentinel values + sentinel_type = getattr(click.core, "_SentinelClass", None) + if isinstance(sentinel_type, type) and isinstance(o, sentinel_type): + return None + + # Handle tuples (convert to lists) + if isinstance(o, tuple): + return list(o) + + # Handle any other non-serializable objects + try: + return super().default(o) + except TypeError: + return str(o) + + +# Type mapping from Click types to JSON Schema types +def click_type_to_json_schema_type(click_type: click.ParamType) -> dict[str, Any]: + """Convert a Click parameter type to JSON Schema type definition""" + + if isinstance(click_type, click.types.StringParamType): + return {"type": "string"} + elif isinstance(click_type, click.types.IntParamType): + return {"type": "integer"} + elif 
isinstance(click_type, click.types.FloatParamType): + return {"type": "number"} + elif isinstance(click_type, click.types.BoolParamType): + return {"type": "boolean"} + elif isinstance(click_type, click.types.Choice): + return {"type": "string", "enum": list(click_type.choices)} + elif isinstance(click_type, click.types.Path): + return {"type": "string", "description": "File or directory path"} + elif isinstance(click_type, click.types.File): + return {"type": "string", "description": "File path"} + elif isinstance(click_type, click.types.Tuple): + # Multiple arguments of same type + return {"type": "array", "items": {"type": "string"}} + else: + # Default to string for unknown types + return {"type": "string"} + + +def click_command_to_mcp_tool(cmd_name: str, click_command: click.Command) -> dict[str, Any]: + """ + Convert a Click command to an MCP tool definition with JSON Schema. + + Introspects the Click command's parameters to automatically generate + the input schema without manual definition. 
+ """ + + properties: dict[str, dict[str, Any]] = {} + required: list[str] = [] + + # Extract parameters from Click command + for param in click_command.params: + # Skip internal parameters + if param.name is None or param.name in ("help", "version"): + continue + + param_schema = click_type_to_json_schema_type(param.type) + + # Add description from Click help text + help_text = getattr(param, "help", None) + if help_text: + param_schema["description"] = help_text + + # Handle default values + if param.default is not None and param.default != (): + param_schema["default"] = param.default + + # Handle multiple values (like multiple URLs) + if param.multiple: + properties[param.name] = { + "type": "array", + "items": param_schema, + "description": param_schema.get("description", f"Multiple {param.name} values"), + } + else: + properties[param.name] = param_schema + + # Mark as required if Click requires it + if param.required: + required.append(param.name) + + return { + "name": cmd_name, + "description": click_command.help or click_command.short_help or f"Run archivebox {cmd_name} command", + "inputSchema": { + "type": "object", + "properties": properties, + "required": required, + }, + } + + +def execute_click_command(cmd_name: str, click_command: click.Command, arguments: dict) -> dict: + """ + Execute a Click command programmatically with given arguments. + + Returns MCP-formatted result with captured output and error status. 
+ """ + + # Setup Django for archive commands (commands that need database access) + from archivebox.cli import ArchiveBoxGroup + + if cmd_name in ArchiveBoxGroup.archive_commands: + try: + from archivebox.config.django import setup_django + from archivebox.misc.checks import check_data_folder + + setup_django() + check_data_folder() + except Exception as e: + # If Django setup fails, return error (unless it's manage/shell which handle this themselves) + if cmd_name not in ("manage", "shell"): + return { + "content": [ + { + "type": "text", + "text": f"Error setting up Django: {str(e)}\n\nMake sure you're running the MCP server from inside an ArchiveBox data directory.", + }, + ], + "isError": True, + } + + # Use Click's test runner to invoke command programmatically + runner = CliRunner() + + # Build a map of parameter names to their Click types (Argument vs Option) + param_map = {param.name: param for param in click_command.params} + + # Convert arguments dict to CLI args list + args = [] + positional_args = [] + + for key, value in arguments.items(): + param_name = key.replace("_", "-") # Click uses dashes + param = param_map.get(key) + + # Check if this is a positional Argument (not an Option) + is_argument = isinstance(param, click.Argument) + + if is_argument: + # Positional arguments - add them without dashes + if isinstance(value, list): + positional_args.extend([str(v) for v in value]) + elif value is not None: + positional_args.append(str(value)) + else: + # Options - add with dashes + if isinstance(value, bool): + if value: + args.append(f"--{param_name}") + elif isinstance(value, list): + # Multiple values for an option (rare) + for item in value: + args.append(f"--{param_name}") + args.append(str(item)) + elif value is not None: + args.append(f"--{param_name}") + args.append(str(value)) + + # Add positional arguments at the end + args.extend(positional_args) + + # Execute the command + try: + result = runner.invoke(click_command, args, 
catch_exceptions=False) + + # Format output as MCP content + content = [] + + if result.output: + content.append( + { + "type": "text", + "text": result.output, + }, + ) + + if result.stderr_bytes: + stderr_text = result.stderr_bytes.decode("utf-8", errors="replace") + if stderr_text.strip(): + content.append( + { + "type": "text", + "text": f"[stderr]\n{stderr_text}", + }, + ) + + # Check exit code + is_error = result.exit_code != 0 + + if is_error and not content: + content.append( + { + "type": "text", + "text": f"Command failed with exit code {result.exit_code}", + }, + ) + + return { + "content": content or [{"type": "text", "text": "(no output)"}], + "isError": is_error, + } + + except Exception as e: + # Capture any exceptions during execution + error_trace = traceback.format_exc() + return { + "content": [ + { + "type": "text", + "text": f"Error executing {cmd_name}: {str(e)}\n\n{error_trace}", + }, + ], + "isError": True, + } + + +class MCPServer: + """ + Model Context Protocol server for ArchiveBox. + + Provides JSON-RPC 2.0 interface over stdio, dynamically exposing + all Click commands as MCP tools. 
+ """ + + def __init__(self): + # Import here to avoid circular imports + from archivebox.cli import ArchiveBoxGroup + + self.cli_group = ArchiveBoxGroup() + self.protocol_version = "2025-11-25" + self._tool_cache = {} # Cache loaded Click commands + + def get_click_command(self, cmd_name: str) -> click.Command | None: + """Get a Click command by name, with caching""" + if cmd_name not in self._tool_cache: + if cmd_name not in self.cli_group.all_subcommands: + return None + self._tool_cache[cmd_name] = self.cli_group.get_command(click.Context(self.cli_group), cmd_name) + return self._tool_cache[cmd_name] + + def handle_initialize(self, params: dict) -> dict: + """Handle MCP initialize request""" + return { + "protocolVersion": self.protocol_version, + "capabilities": { + "tools": {}, + }, + "serverInfo": { + "name": "archivebox-mcp", + "version": VERSION, + }, + } + + def handle_tools_list(self, params: dict) -> dict: + """Handle MCP tools/list request - returns all available CLI commands as tools""" + tools = [] + + for cmd_name in self.cli_group.all_subcommands.keys(): + click_cmd = self.get_click_command(cmd_name) + if click_cmd: + try: + tool_def = click_command_to_mcp_tool(cmd_name, click_cmd) + tools.append(tool_def) + except Exception as e: + # Log but don't fail - skip problematic commands + print(f"Warning: Could not generate tool for {cmd_name}: {e}", file=sys.stderr) + + return {"tools": tools} + + def handle_tools_call(self, params: dict) -> dict: + """Handle MCP tools/call request - executes a CLI command""" + tool_name = params.get("name") + arguments = params.get("arguments", {}) + + if not tool_name: + raise ValueError("Missing required parameter: name") + + click_cmd = self.get_click_command(tool_name) + if not click_cmd: + raise ValueError(f"Unknown tool: {tool_name}") + + # Execute the command and return MCP-formatted result + return execute_click_command(tool_name, click_cmd, arguments) + + def handle_request(self, request: dict) -> dict: + """ 
+ Handle a JSON-RPC 2.0 request and return response. + + Supports MCP methods: initialize, tools/list, tools/call + """ + + method = request.get("method") + params = request.get("params", {}) + request_id = request.get("id") + + try: + # Route to appropriate handler + if method == "initialize": + result = self.handle_initialize(params) + elif method == "tools/list": + result = self.handle_tools_list(params) + elif method == "tools/call": + result = self.handle_tools_call(params) + else: + # Method not found + return { + "jsonrpc": "2.0", + "id": request_id, + "error": { + "code": -32601, + "message": f"Method not found: {method}", + }, + } + + # Success response + return { + "jsonrpc": "2.0", + "id": request_id, + "result": result, + } + + except Exception as e: + # Error response + error_trace = traceback.format_exc() + return { + "jsonrpc": "2.0", + "id": request_id, + "error": { + "code": -32603, + "message": str(e), + "data": error_trace, + }, + } + + def run_stdio_server(self): + """ + Run the MCP server in stdio mode. + + Reads JSON-RPC requests from stdin (one per line), + writes JSON-RPC responses to stdout (one per line). 
+ """ + + # Read requests from stdin line by line + for line in sys.stdin: + line = line.strip() + if not line: + continue + + try: + # Parse JSON-RPC request + request = json.loads(line) + + # Handle request + response = self.handle_request(request) + + # Write response to stdout (use custom encoder for Click types) + print(json.dumps(response, cls=MCPJSONEncoder), flush=True) + + except json.JSONDecodeError as e: + # Invalid JSON + error_response = { + "jsonrpc": "2.0", + "id": None, + "error": { + "code": -32700, + "message": "Parse error", + "data": str(e), + }, + } + print(json.dumps(error_response, cls=MCPJSONEncoder), flush=True) + + +def run_mcp_server(): + """Main entry point for MCP server""" + server = MCPServer() + server.run_stdio_server() diff --git a/archivebox/misc/__init__.py b/archivebox/misc/__init__.py new file mode 100644 index 0000000000..1619d0560c --- /dev/null +++ b/archivebox/misc/__init__.py @@ -0,0 +1 @@ +__package__ = "archivebox.misc" diff --git a/archivebox/misc/checks.py b/archivebox/misc/checks.py new file mode 100644 index 0000000000..46444662f5 --- /dev/null +++ b/archivebox/misc/checks.py @@ -0,0 +1,317 @@ +__package__ = "archivebox.misc" + +import os +import sys +from pathlib import Path + +from rich import print +from rich.panel import Panel + +# DO NOT ADD ANY TOP-LEVEL IMPORTS HERE to anything other than builtin python libraries +# this file is imported by archivebox/__init__.py +# and any imports here will be imported by EVERYTHING else +# so this file should only be used for pure python checks +# that don't need to import other parts of ArchiveBox + +# if a check needs to import other parts of ArchiveBox, +# the imports should be done inside the check function +# and you should make sure if you need to import any django stuff +# that the check is called after django.setup() has been called + + +def check_data_folder() -> None: + from archivebox import DATA_DIR, ARCHIVE_DIR + from archivebox.config import CONSTANTS + from 
archivebox.config.paths import create_and_chown_dir, get_or_create_working_tmp_dir, get_or_create_working_lib_dir + + archive_dir_exists = os.path.isdir(ARCHIVE_DIR) + if not archive_dir_exists: + print("[red][X] No archivebox index found in the current directory.[/red]", file=sys.stderr) + print(f" {DATA_DIR}", file=sys.stderr) + print(file=sys.stderr) + print(" [violet]Hint[/violet]: Are you running archivebox in the right folder?", file=sys.stderr) + print(" cd path/to/your/archive/folder", file=sys.stderr) + print(" archivebox [command]", file=sys.stderr) + print(file=sys.stderr) + print(" [violet]Hint[/violet]: To create a new archive collection or import existing data in this folder, run:", file=sys.stderr) + print(" archivebox init", file=sys.stderr) + raise SystemExit(2) + + # Create data dir subdirs + create_and_chown_dir(CONSTANTS.SOURCES_DIR) + create_and_chown_dir(CONSTANTS.PERSONAS_DIR / "Default") + create_and_chown_dir(CONSTANTS.LOGS_DIR) + # create_and_chown_dir(CONSTANTS.CACHE_DIR) + + # Create /tmp and /lib dirs if they don't exist + get_or_create_working_tmp_dir(autofix=True, quiet=False) + get_or_create_working_lib_dir(autofix=True, quiet=False) + + # Check data dir permissions, /tmp, and /lib permissions + check_data_dir_permissions() + + +def check_migrations(): + from archivebox import DATA_DIR + from archivebox.misc.db import list_migrations + + pending_migrations = [name for status, name in list_migrations() if not status] + is_migrating = any(arg in sys.argv for arg in ["makemigrations", "migrate", "init"]) + + if pending_migrations and not is_migrating: + print("[red][X] This collection was created with an older version of ArchiveBox and must be upgraded first.[/red]") + print(f" {DATA_DIR}", file=sys.stderr) + print(file=sys.stderr) + print( + f" [violet]Hint:[/violet] To upgrade it to the latest version and apply the {len(pending_migrations)} pending migrations, run:", + file=sys.stderr, + ) + print(" archivebox init", file=sys.stderr) 
+ raise SystemExit(3) + + +def check_io_encoding(): + PYTHON_ENCODING = (sys.__stdout__ or sys.stdout or sys.__stderr__ or sys.stderr).encoding.upper().replace("UTF8", "UTF-8") + + if PYTHON_ENCODING != "UTF-8": + print( + f"[red][X] Your system is running python3 scripts with a bad locale setting: {PYTHON_ENCODING} (it should be UTF-8).[/red]", + file=sys.stderr, + ) + print(' To fix it, add the line "export PYTHONIOENCODING=UTF-8" to your ~/.bashrc file (without quotes)', file=sys.stderr) + print(' Or if you\'re using ubuntu/debian, run "dpkg-reconfigure locales"', file=sys.stderr) + print("") + print(" Confirm that it's fixed by opening a new shell and running:", file=sys.stderr) + print(' python3 -c "import sys; print(sys.stdout.encoding)" # should output UTF-8', file=sys.stderr) + raise SystemExit(2) + + # # hard errors: check python version + # if sys.version_info[:3] < (3, 10, 0): + # print('[red][X] Python version is not new enough: {sys.version} (>3.10 is required)[/red]', file=sys.stderr) + # print(' See https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.', file=sys.stderr) + # raise SystemExit(2) + + # # hard errors: check django version + # if int(django.VERSION[0]) < 5: + # print('[red][X] Django version is not new enough: {django.VERSION[:3]} (>=5.0 is required)[/red]', file=sys.stderr) + # print(' Upgrade django using pip or your system package manager: pip3 install --upgrade django', file=sys.stderr) + # raise SystemExit(2) + + +def check_not_root(): + from archivebox.config.permissions import IS_ROOT, IN_DOCKER + + attempted_command = " ".join(sys.argv[1:]) if len(sys.argv) > 1 else "" + is_getting_help = "-h" in sys.argv or "--help" in sys.argv or "help" in sys.argv + is_getting_version = "--version" in sys.argv or "version" in sys.argv + is_installing = "setup" in sys.argv or "install" in sys.argv + + if IS_ROOT and not (is_getting_help or is_getting_version or is_installing): + 
print("[red][!] ArchiveBox should never be run as root![/red]", file=sys.stderr) + print(" For more information, see the security overview documentation:", file=sys.stderr) + print(" https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#do-not-run-as-root", file=sys.stderr) + + if IN_DOCKER: + print( + "[red][!] When using Docker, you must run commands with [green]docker run[/green] instead of [yellow3]docker exec[/yellow3], e.g.:", + file=sys.stderr, + ) + print(" docker compose run archivebox {attempted_command}", file=sys.stderr) + print(f" docker run -it -v $PWD/data:/data archivebox/archivebox {attempted_command}", file=sys.stderr) + print(" or:", file=sys.stderr) + print( + f' docker compose exec --user=archivebox archivebox /bin/bash -c "archivebox {attempted_command}"', + file=sys.stderr, + ) + print( + f' docker exec -it --user=archivebox <container id> /bin/bash -c "archivebox {attempted_command}"', + file=sys.stderr, + ) + raise SystemExit(2) + + +def check_not_inside_source_dir(): + """Prevent running ArchiveBox from inside its source directory (would pollute repo with data files).""" + cwd = Path(os.getcwd()).resolve() + is_source_dir = (cwd / "archivebox" / "__init__.py").exists() and (cwd / "pyproject.toml").exists() + data_dir_set_elsewhere = os.environ.get("DATA_DIR", "").strip() and Path(os.environ["DATA_DIR"]).resolve() != cwd + is_testing = "pytest" in sys.modules or "unittest" in sys.modules + + if is_source_dir and not data_dir_set_elsewhere and not is_testing: + raise SystemExit("[!] 
Cannot run from source dir, set DATA_DIR or cd to a data folder first") + + +def check_data_dir_permissions(): + from archivebox import DATA_DIR + from archivebox.misc.logging import STDERR + from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, DEFAULT_PUID, DEFAULT_PGID, IS_ROOT, USER + from archivebox.config.paths import get_or_create_working_tmp_dir, get_or_create_working_lib_dir + + data_dir_stat = Path(DATA_DIR).stat() + data_dir_uid, data_dir_gid = data_dir_stat.st_uid, data_dir_stat.st_gid + data_owned_by_root = data_dir_uid == 0 + + # data_owned_by_default_user = data_dir_uid == DEFAULT_PUID or data_dir_gid == DEFAULT_PGID + data_owner_doesnt_match = (data_dir_uid != ARCHIVEBOX_USER and data_dir_gid != ARCHIVEBOX_GROUP) if not IS_ROOT else False + data_not_writable = not (os.path.isdir(DATA_DIR) and os.access(DATA_DIR, os.W_OK)) + if data_owned_by_root: + STDERR.print( + "\n[yellow]:warning: Warning: ArchiveBox [blue]DATA_DIR[/blue] is currently owned by [red]root[/red], it must be changed before archiving can run![/yellow]", + ) + elif data_owner_doesnt_match or data_not_writable: + STDERR.print( + f"\n[yellow]:warning: Warning: ArchiveBox [blue]DATA_DIR[/blue] is currently owned by [red]{data_dir_uid}:{data_dir_gid}[/red], but ArchiveBox user is [blue]{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/blue] ({USER})! 
(ArchiveBox may not be able to write to the data dir)[/yellow]", + ) + + if data_owned_by_root or data_owner_doesnt_match or data_not_writable: + STDERR.print( + f"[violet]Hint:[/violet] Change the current ownership [red]{data_dir_uid}[/red]:{data_dir_gid} (PUID:PGID) to a non-root user & group that will run ArchiveBox, e.g.:", + ) + STDERR.print(f" [grey53]sudo[/grey53] chown -R [blue]{DEFAULT_PUID}:{DEFAULT_PGID}[/blue] {DATA_DIR.resolve()}") + STDERR.print() + STDERR.print("[blue]More info:[/blue]") + STDERR.print( + " [link=https://github.com/ArchiveBox/ArchiveBox#storage-requirements]https://github.com/ArchiveBox/ArchiveBox#storage-requirements[/link]", + ) + STDERR.print( + " [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#permissions]https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#permissions[/link]", + ) + STDERR.print( + " [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid]https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid[/link]", + ) + STDERR.print( + " [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#filesystem-doesnt-support-fsync-eg-network-mounts]https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#filesystem-doesnt-support-fsync-eg-network-mounts[/link]", + ) + + from archivebox.config.common import STORAGE_CONFIG + + try: + tmp_dir = get_or_create_working_tmp_dir(autofix=True, quiet=True) or STORAGE_CONFIG.TMP_DIR + except Exception: + tmp_dir = STORAGE_CONFIG.TMP_DIR + + try: + lib_dir = get_or_create_working_lib_dir(autofix=True, quiet=True) or STORAGE_CONFIG.LIB_DIR + except Exception: + lib_dir = STORAGE_CONFIG.LIB_DIR + + # Check /tmp dir permissions + check_tmp_dir(tmp_dir, throw=False, must_exist=True) + + # Check /lib dir permissions + check_lib_dir(lib_dir, throw=False, must_exist=True) + + os.umask(0o777 - int(STORAGE_CONFIG.DIR_OUTPUT_PERMISSIONS, base=8)) + + +def check_tmp_dir(tmp_dir=None, throw=False, quiet=False, 
must_exist=True): + from archivebox.config.paths import assert_dir_can_contain_unix_sockets, dir_is_writable, get_or_create_working_tmp_dir + from archivebox.misc.logging import STDERR + from archivebox.misc.logging_util import pretty_path + from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP + from archivebox.config.common import STORAGE_CONFIG + + tmp_dir = tmp_dir or STORAGE_CONFIG.TMP_DIR + socket_file = tmp_dir.absolute().resolve() / "supervisord.sock" + + if not must_exist and not os.path.isdir(tmp_dir): + # just check that its viable based on its length (because dir may not exist yet, we cant check if its writable) + return len(f"file://{socket_file}") <= 96 + + tmp_is_valid = False + allow_no_unix_sockets = os.environ.get("ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS", "").lower() in ("1", "true", "yes") + try: + tmp_is_valid = dir_is_writable(tmp_dir) + if not allow_no_unix_sockets: + tmp_is_valid = tmp_is_valid and assert_dir_can_contain_unix_sockets(tmp_dir) + assert tmp_is_valid, f"ArchiveBox user PUID={ARCHIVEBOX_USER} PGID={ARCHIVEBOX_GROUP} is unable to write to TMP_DIR={tmp_dir}" + assert len(f"file://{socket_file}") <= 96, ( + f"ArchiveBox TMP_DIR={tmp_dir} is too long, dir containing unix socket files must be <90 chars." 
+ ) + return True + except Exception as e: + if not quiet: + STDERR.print() + ERROR_TEXT = "\n".join( + ( + "", + f"[red]:cross_mark: ArchiveBox is unable to use TMP_DIR={pretty_path(tmp_dir)}[/red]", + f" [yellow]{e}[/yellow]", + "", + "[blue]Info:[/blue] [grey53]The TMP_DIR is used for the supervisord unix socket file and other temporary files.", + " - It [red]must[/red] be on a local drive (not inside a docker volume, remote network drive, or FUSE mount).", + f" - It [red]must[/red] be readable and writable by the ArchiveBox user (PUID={ARCHIVEBOX_USER}, PGID={ARCHIVEBOX_GROUP}).", + " - It [red]must[/red] be a *short* path (less than 90 characters) due to UNIX path length restrictions for sockets.", + " - It [yellow]should[/yellow] be able to hold at least 200MB of data (in-progress downloads can be large).[/grey53]", + "", + "[violet]Hint:[/violet] Fix it by setting TMP_DIR to a path that meets these requirements, e.g.:", + f" [green]archivebox config --set TMP_DIR={get_or_create_working_tmp_dir(autofix=False, quiet=True) or '/tmp/archivebox'}[/green]", + "", + ), + ) + STDERR.print( + Panel( + ERROR_TEXT, + expand=False, + border_style="red", + title="[red]:cross_mark: Error with configured TMP_DIR[/red]", + subtitle="Background workers may fail to start until fixed.", + ), + ) + STDERR.print() + if throw: + raise OSError(f"TMP_DIR={tmp_dir} is invalid, ArchiveBox is unable to use it and the server will fail to start!") from e + return False + + +def check_lib_dir(lib_dir: Path | None = None, throw=False, quiet=False, must_exist=True): + from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP + from archivebox.misc.logging import STDERR + from archivebox.misc.logging_util import pretty_path + from archivebox.config.paths import dir_is_writable, get_or_create_working_lib_dir + from archivebox.config.common import STORAGE_CONFIG + + lib_dir = lib_dir or STORAGE_CONFIG.LIB_DIR + + # assert lib_dir == STORAGE_CONFIG.LIB_DIR, "lib_dir is not 
the same as the one in the flat config" + + if not must_exist and not os.path.isdir(lib_dir): + return True + + lib_is_valid = False + try: + lib_is_valid = dir_is_writable(lib_dir) + assert lib_is_valid, f"ArchiveBox user PUID={ARCHIVEBOX_USER} PGID={ARCHIVEBOX_GROUP} is unable to write to LIB_DIR={lib_dir}" + return True + except Exception as e: + if not quiet: + STDERR.print() + ERROR_TEXT = "\n".join( + ( + "", + f"[red]:cross_mark: ArchiveBox is unable to use LIB_DIR={pretty_path(lib_dir)}[/red]", + f" [yellow]{e}[/yellow]", + "", + "[blue]Info:[/blue] [grey53]The LIB_DIR is used to store ArchiveBox auto-installed plugin library and binary dependencies.", + f" - It [red]must[/red] be readable and writable by the ArchiveBox user (PUID={ARCHIVEBOX_USER}, PGID={ARCHIVEBOX_GROUP}).", + " - It [yellow]should[/yellow] be on a local (ideally fast) drive like an SSD or HDD (not on a network drive or external HDD).", + " - It [yellow]should[/yellow] be able to hold at least 1GB of data (some dependencies like Chrome can be large).[/grey53]", + "", + "[violet]Hint:[/violet] Fix it by setting LIB_DIR to a path that meets these requirements, e.g.:", + f" [green]archivebox config --set LIB_DIR={get_or_create_working_lib_dir(autofix=False, quiet=True) or '/usr/local/share/archivebox'}[/green]", + "", + ), + ) + STDERR.print( + Panel( + ERROR_TEXT, + expand=False, + border_style="red", + title="[red]:cross_mark: Error with configured LIB_DIR[/red]", + subtitle="[yellow]Dependencies may not auto-install properly until fixed.[/yellow]", + ), + ) + STDERR.print() + if throw: + raise OSError(f"LIB_DIR={lib_dir} is invalid, ArchiveBox is unable to use it and dependencies will fail to install.") from e + return False diff --git a/archivebox/misc/db.py b/archivebox/misc/db.py new file mode 100644 index 0000000000..d9e66f3f97 --- /dev/null +++ b/archivebox/misc/db.py @@ -0,0 +1,53 @@ +""" +Database utility functions for ArchiveBox. 
+""" + +__package__ = "archivebox.misc" + +from io import StringIO +from pathlib import Path +from typing import Any + +from archivebox.config import DATA_DIR +from archivebox.misc.util import enforce_types + + +@enforce_types +def list_migrations(out_dir: Path = DATA_DIR) -> list[tuple[bool, str]]: + """List all Django migrations and their status""" + from django.core.management import call_command + + out = StringIO() + call_command("showmigrations", list=True, stdout=out) + out.seek(0) + + migrations = [] + for line in out.readlines(): + if line.strip() and "]" in line: + status_str, name_str = line.strip().split("]", 1) + is_applied = "X" in status_str + migration_name = name_str.strip() + migrations.append((is_applied, migration_name)) + + return migrations + + +@enforce_types +def apply_migrations(out_dir: Path = DATA_DIR) -> list[str]: + """Apply pending Django migrations""" + from django.core.management import call_command + + out1 = StringIO() + + call_command("migrate", interactive=False, database="default", stdout=out1) + out1.seek(0) + + return [line.strip() for line in out1.readlines() if line.strip()] + + +@enforce_types +def get_admins(out_dir: Path = DATA_DIR) -> list[Any]: + """Get list of superuser accounts""" + from django.contrib.auth.models import User + + return list(User.objects.filter(is_superuser=True).exclude(username="system")) diff --git a/archivebox/misc/debugging.py b/archivebox/misc/debugging.py new file mode 100644 index 0000000000..4ada510c37 --- /dev/null +++ b/archivebox/misc/debugging.py @@ -0,0 +1,33 @@ +from functools import wraps +from time import time + + +def timed_function(func): + """ + Very simple profiling decorator for debugging. + Usage: + @timed_function + def my_func(): + ... 
+ + More advanced alternatives: + - viztracer ../.venv/bin/archivebox manage check # https://viztracer.readthedocs.io/en/latest/filter.html + - python -m cProfile -o archivebox.prof ../.venv/bin/archivebox manage check; snakeviz archivebox.prof + - Django Debug Toolbar + django-debug-toolbar-flamegraph + + Django Requests Tracker (requests-tracker) + """ + + @wraps(func) + def wrap(*args, **kwargs): + if args and hasattr(args[0], "__module__"): + module = args[0].__module__ + else: + module = func.__module__ + ts_start = time() + result = func(*args, **kwargs) + ts_end = time() + ms_elapsed = int((ts_end - ts_start) * 1000) + print(f"[DEBUG][{ms_elapsed}ms] {module}.{func.__name__}(...)") + return result + + return wrap diff --git a/archivebox/misc/folders.py b/archivebox/misc/folders.py new file mode 100644 index 0000000000..2d9a55a5ea --- /dev/null +++ b/archivebox/misc/folders.py @@ -0,0 +1,50 @@ +""" +Folder utilities for ArchiveBox. + +Note: This file only contains legacy cleanup utilities. +The DB is the single source of truth - use Snapshot.objects queries for all status checks. +""" + +__package__ = "archivebox.misc" + +import os +import json +import shutil +from pathlib import Path + +from archivebox.config import DATA_DIR, CONSTANTS +from archivebox.misc.util import enforce_types + + +@enforce_types +def fix_invalid_folder_locations(out_dir: Path = DATA_DIR) -> tuple[list[str], list[str]]: + """ + Legacy cleanup: Move folders to their correct timestamp-named locations based on index.json. + + This is only used during 'archivebox init' for one-time cleanup of misnamed directories. + After this runs once, 'archivebox update' handles all filesystem operations. 
+ """ + fixed = [] + cant_fix = [] + for entry in os.scandir(out_dir / CONSTANTS.ARCHIVE_DIR_NAME): + if entry.is_dir(follow_symlinks=True): + index_path = Path(entry.path) / "index.json" + if index_path.exists(): + try: + with open(index_path) as f: + data = json.load(f) + timestamp = data.get("timestamp") + except Exception: + continue + + if not timestamp: + continue + + if not entry.path.endswith(f"/{timestamp}"): + dest = out_dir / CONSTANTS.ARCHIVE_DIR_NAME / timestamp + if dest.exists(): + cant_fix.append(entry.path) + else: + shutil.move(entry.path, str(dest)) + fixed.append(str(dest)) + return fixed, cant_fix diff --git a/archivebox/misc/hashing.py b/archivebox/misc/hashing.py new file mode 100644 index 0000000000..f671195ea8 --- /dev/null +++ b/archivebox/misc/hashing.py @@ -0,0 +1,257 @@ +import hashlib +import mimetypes +from functools import lru_cache +from pathlib import Path +from collections.abc import Callable +from datetime import datetime + + +@lru_cache(maxsize=1024) +def _cached_file_hash(filepath: str, size: int, mtime: float) -> str: + """Internal function to calculate file hash with cache key based on path, size and mtime.""" + sha256_hash = hashlib.sha256() + + with open(filepath, "rb") as f: + for chunk in iter(lambda: f.read(4096), b""): + sha256_hash.update(chunk) + + return sha256_hash.hexdigest() + + +@lru_cache(maxsize=10) +def hash_file(file_path: Path, pwd: Path | None = None) -> str: + """Calculate SHA256 hash of a file with caching based on path, size and mtime.""" + pwd = Path(pwd) if pwd else None + file_path = Path(file_path) + if not file_path.is_absolute(): + file_path = pwd / file_path if pwd else file_path.absolute() + + abs_path = file_path.resolve() + stat_info = abs_path.stat() + + return _cached_file_hash( + str(abs_path), + stat_info.st_size, + stat_info.st_mtime, + ) + + +@lru_cache(maxsize=10) +def get_dir_hashes(dir_path: Path, pwd: Path | None = None, filter_func: Callable | None = None, max_depth: int = -1) -> 
dict[str, str]: + """Calculate SHA256 hashes for all files and directories recursively.""" + pwd = Path(pwd) if pwd else None + dir_path = Path(dir_path) + if not dir_path.is_absolute(): + dir_path = pwd / dir_path if pwd else dir_path.absolute() + + if not dir_path.is_dir(): + raise ValueError(f"Not a directory: {dir_path}") + if max_depth < -1: + raise ValueError(f"max_depth must be >= -1, got {max_depth}") + + # Get all files recursively + all_files = get_dir_entries( + dir_path, + pwd=pwd, + recursive=True, + include_files=True, + include_dirs=False, + filter_func=filter_func, + ) + + hashes: dict[str, str] = {} + hashable_summary = [] + + # Calculate hashes for all files + for subfile in all_files: + subfile_path = dir_path / subfile + sha256_hash = hash_file(subfile_path) + hashes[subfile] = sha256_hash + hashable_summary.append(f"{sha256_hash} ./{subfile}") + + # Calculate hashes for all directories + subdirs = get_dir_entries( + dir_path, + pwd=pwd, + recursive=True, + include_files=False, + include_dirs=True, + include_hidden=False, + filter_func=filter_func, + max_depth=max_depth, + ) + + for subdir in subdirs: + subdir_path = dir_path / subdir + subdir_hashes = get_dir_hashes( + subdir_path, + filter_func=filter_func, + max_depth=0, + ) + hashes[subdir] = subdir_hashes["."] + + # Filter results by max_depth + if max_depth >= 0: + hashes = {path: value for path, value in hashes.items() if len(Path(path).parts) <= max_depth + 1} + + # Calculate root directory hash + hashable_summary.sort() + root_sha256 = hashlib.sha256("\n".join(hashable_summary).encode()).hexdigest() + hashes["."] = root_sha256 + + return hashes + + +@lru_cache(maxsize=128) +def get_dir_entries( + dir_path: Path, + pwd: Path | None = None, + recursive: bool = True, + include_files: bool = True, + include_dirs: bool = True, + include_hidden: bool = False, + filter_func: Callable | None = None, + max_depth: int = -1, +) -> tuple[str, ...]: + """Get filtered list of directory entries.""" + 
pwd = Path(pwd) if pwd else None + dir_path = Path(dir_path) + if not dir_path.is_absolute(): + dir_path = pwd / dir_path if pwd else dir_path.absolute() + + results = [] + + def process_path(path: Path, depth: int): + if not include_hidden and path.name.startswith("."): + return False + if max_depth >= 0 and depth > max_depth: + return False + if filter_func: + info = { + "abspath": str(path.absolute()), + "relpath": str(path.relative_to(dir_path)), + } + if not filter_func(info): + return False + return True + + for path in dir_path.rglob("*") if recursive else dir_path.glob("*"): + current_depth = len(path.relative_to(dir_path).parts) + + if path.is_file() and include_files and process_path(path, current_depth): + results.append(str(path.relative_to(dir_path))) + elif path.is_dir() and include_dirs and process_path(path, current_depth): + results.append(str(path.relative_to(dir_path))) + + if not recursive: + break + + return tuple(sorted(results)) # Make immutable for caching + + +@lru_cache(maxsize=1024) +def get_dir_sizes(dir_path: Path, pwd: Path | None = None, **kwargs) -> dict[str, int]: + """Calculate sizes for all files and directories recursively.""" + sizes: dict[str, int] = {} + hashes = get_dir_hashes(dir_path, pwd=pwd, **kwargs) + dir_path = Path(dir_path) + + for path_key in hashes: + full_path = dir_path / path_key + if full_path.is_file(): + sizes[path_key] = full_path.stat().st_size + else: + total = 0 + for file_path in full_path.rglob("*"): + if file_path.is_file() and not file_path.name.startswith("."): + total += file_path.stat().st_size + sizes[path_key + "/"] = total + + return sizes + + +@lru_cache(maxsize=10) +def get_dir_info(dir_path: Path, pwd: Path | None = None, filter_func: Callable | None = None, max_depth: int = -1) -> dict: + """Get detailed information about directory contents including hashes and sizes.""" + pwd = Path(pwd) if pwd else None + dir_path = Path(dir_path) + if not dir_path.is_absolute(): + dir_path = pwd / 
dir_path if pwd else dir_path.absolute() + + hashes = get_dir_hashes(dir_path, pwd=pwd, filter_func=filter_func, max_depth=max_depth) + sizes = get_dir_sizes(str(dir_path), pwd=pwd, filter_func=filter_func, max_depth=max_depth) + + num_total_subpaths = sum(1 for name in hashes if name != ".") + details = {} + + for filename, sha256_hash in sorted(hashes.items()): + abs_path = (dir_path / filename).resolve() + stat_info = abs_path.stat() + num_subpaths = sum(1 for p in hashes if p.startswith(filename + "/")) + is_dir = abs_path.is_dir() + if is_dir: + mime_type = "inode/directory" + basename = abs_path.name + extension = "" + num_bytes = sizes[filename + "/"] + if filename == ".": + num_subpaths = num_total_subpaths + else: + filename += "/" + num_subpaths = num_subpaths + else: # is_file + num_subpaths = None + mime_type = mimetypes.guess_type(str(abs_path))[0] + extension = abs_path.suffix + basename = abs_path.name.rsplit(extension, 1)[0] + num_bytes = sizes[filename] + + details[filename] = { + "basename": basename, + "mime_type": mime_type, + "extension": extension, + "num_subpaths": num_subpaths, + "num_bytes": num_bytes, + "hash_sha256": sha256_hash, + "created_at": datetime.fromtimestamp(stat_info.st_ctime).isoformat(), + "modified_at": datetime.fromtimestamp(stat_info.st_mtime).isoformat(), + } + + if filter_func and not filter_func(details[filename]): + del details[filename] + + return details + + +if __name__ == "__main__": + import json + + dir_info = get_dir_info(Path("."), max_depth=6) + with open(".hashes.json", "w") as f: + json.dump(dir_info, f, indent=4) + print("Wrote .hashes.json") + +# Example output: +# { +# ".": { +# "basename": "misc", +# "mime_type": "inode/directory", +# "extension": "", +# "num_subpaths": 25, +# "num_bytes": 214677, +# "hash_sha256": "addfacf88b2ff6b564846415fb7b21dcb7e63ee4e911bc0aec255ee354958530", +# "created_at": "2024-12-04T00:08:38.537449", +# "modified_at": "2024-12-04T00:08:38.537449" +# }, +# "__init__.py": { +# 
"basename": "__init__", +# "mime_type": "text/x-python", +# "extension": ".py", +# "num_subpaths": null, +# "num_bytes": 32, +# "hash_sha256": "b0e5e7ff17db3b60535cf664282787767c336e3e203a43e21b6326c6fe457551", +# "created_at": "2024-10-08T00:51:41.001359", +# "modified_at": "2024-10-08T00:51:41.001359" +# }, +# ... +# } diff --git a/archivebox/misc/jsonl.py b/archivebox/misc/jsonl.py new file mode 100644 index 0000000000..e232277415 --- /dev/null +++ b/archivebox/misc/jsonl.py @@ -0,0 +1,178 @@ +""" +JSONL (JSON Lines) utilities for ArchiveBox. + +Provides functions for reading, writing, and processing typed JSONL records. +All CLI commands that accept stdin can read both plain URLs and typed JSONL. + +CLI Pipeline: + archivebox crawl URL -> {"type": "Crawl", "id": "...", "urls": "...", ...} + archivebox snapshot -> {"type": "Snapshot", "id": "...", "url": "...", ...} + archivebox extract -> {"type": "ArchiveResult", "id": "...", "snapshot_id": "...", ...} + +Typed JSONL Format: + {"type": "Crawl", "id": "...", "urls": "...", "max_depth": 0, ...} + {"type": "Snapshot", "id": "...", "url": "https://example.com", "title": "...", ...} + {"type": "ArchiveResult", "id": "...", "snapshot_id": "...", "plugin": "...", ...} + {"type": "Tag", "name": "..."} + +Plain URLs (also supported): + https://example.com + https://foo.com +""" + +__package__ = "archivebox.misc" + +import sys +import json +import select +from typing import Any, TextIO +from collections.abc import Iterable, Iterator +from pathlib import Path + + +# Type constants for JSONL records +TYPE_SNAPSHOT = "Snapshot" +TYPE_ARCHIVERESULT = "ArchiveResult" +TYPE_TAG = "Tag" +TYPE_CRAWL = "Crawl" +TYPE_BINARYREQUEST = "BinaryRequest" +TYPE_BINARY = "Binary" +TYPE_PROCESS = "Process" +TYPE_MACHINE = "Machine" + +VALID_TYPES = { + TYPE_SNAPSHOT, + TYPE_ARCHIVERESULT, + TYPE_TAG, + TYPE_CRAWL, + TYPE_BINARYREQUEST, + TYPE_BINARY, + TYPE_PROCESS, + TYPE_MACHINE, +} + + +def parse_line(line: str) -> dict[str, Any] | 
None: + """ + Parse a single line of input as either JSONL or plain URL. + + Returns a dict with at minimum {'type': '...', 'url': '...'} or None if invalid. + """ + line = line.strip() + if not line or line.startswith("#"): + return None + + # Try to parse as JSON first + if line.startswith("{"): + try: + record = json.loads(line) + # If it has a type, validate it + if "type" in record and record["type"] not in VALID_TYPES: + # Unknown type, treat as raw data + pass + # If it has url but no type, assume Snapshot + if "url" in record and "type" not in record: + record["type"] = TYPE_SNAPSHOT + return record + except json.JSONDecodeError: + pass + + # Treat as plain URL if it looks like one + if line.startswith("http://") or line.startswith("https://") or line.startswith("file://"): + return {"type": TYPE_SNAPSHOT, "url": line} + + # Could be a snapshot ID (UUID with dashes or compact 32-char hex) + if len(line) == 36 and line.count("-") == 4: + return {"type": TYPE_SNAPSHOT, "id": line} + if len(line) == 32: + try: + int(line, 16) + except ValueError: + pass + else: + return {"type": TYPE_SNAPSHOT, "id": line} + + # Unknown format, skip + return None + + +def read_stdin(stream: TextIO | None = None) -> Iterator[dict[str, Any]]: + """ + Read JSONL or plain URLs from stdin. + + Yields parsed records as dicts. + Supports both JSONL format and plain URLs (one per line). + """ + active_stream: TextIO = sys.stdin if stream is None else stream + + # Don't block if stdin is a tty with no input + if active_stream.isatty(): + return + + try: + ready, _, _ = select.select([active_stream], [], [], 0) + except (OSError, ValueError): + ready = [active_stream] + + if not ready: + return + + for line in active_stream: + record = parse_line(line) + if record: + yield record + + +def read_file(path: Path) -> Iterator[dict[str, Any]]: + """ + Read JSONL or plain URLs from a file. + + Yields parsed records as dicts. 
+ """ + with open(path) as f: + for line in f: + record = parse_line(line) + if record: + yield record + + +def read_args_or_stdin(args: Iterable[str], stream: TextIO | None = None) -> Iterator[dict[str, Any]]: + """ + Read from CLI arguments if provided, otherwise from stdin. + + Handles both URLs and JSONL from either source. + """ + if args: + for arg in args: + # Check if it's a file path + path = Path(arg) + if path.exists() and path.is_file(): + yield from read_file(path) + else: + record = parse_line(arg) + if record: + yield record + else: + yield from read_stdin(stream) + + +def write_record(record: dict[str, Any], stream: TextIO | None = None) -> None: + """ + Write a single JSONL record to stdout (or provided stream). + """ + active_stream: TextIO = sys.stdout if stream is None else stream + active_stream.write(json.dumps(record) + "\n") + active_stream.flush() + + +def write_records(records: Iterator[dict[str, Any]], stream: TextIO | None = None) -> int: + """ + Write multiple JSONL records to stdout (or provided stream). + + Returns count of records written. + """ + count = 0 + for record in records: + write_record(record, stream) + count += 1 + return count diff --git a/archivebox/misc/legacy.py b/archivebox/misc/legacy.py new file mode 100644 index 0000000000..d4a62b0515 --- /dev/null +++ b/archivebox/misc/legacy.py @@ -0,0 +1,110 @@ +""" +Legacy archive import utilities. + +These functions are used to import data from old ArchiveBox archive formats +(JSON indexes, archive directory structures) into the new database. + +This is separate from the hooks-based parser system which handles importing +new URLs from bookmark files, RSS feeds, etc. 
+""" + +__package__ = "archivebox.misc" + +import os +import json +from pathlib import Path +from datetime import datetime, timezone +from typing import TypedDict +from collections.abc import Iterator + + +class SnapshotDict(TypedDict, total=False): + """ + Dictionary type representing a snapshot/link, compatible with Snapshot model fields. + """ + + url: str # Required: the URL to archive + timestamp: str # Optional: unix timestamp string + title: str # Optional: page title + tags: str # Optional: comma-separated tags string + sources: list[str] # Optional: list of source file paths + + +def parse_json_main_index(out_dir: Path) -> Iterator[SnapshotDict]: + """ + Parse links from the main JSON index file (archive/index.json). + + This is used to recover links from old archive formats. + """ + from archivebox.config import CONSTANTS + + index_path = out_dir / CONSTANTS.JSON_INDEX_FILENAME + if not index_path.exists(): + return + + try: + with open(index_path, encoding="utf-8") as f: + data = json.load(f) + + links = data.get("links", []) + for link in links: + yield { + "url": link.get("url", ""), + "timestamp": link.get("timestamp", str(datetime.now(timezone.utc).timestamp())), + "title": link.get("title"), + "tags": link.get("tags", ""), + } + except (json.JSONDecodeError, KeyError, TypeError): + return + + +def parse_json_links_details(out_dir: Path) -> Iterator[SnapshotDict]: + """ + Parse links from individual snapshot index.jsonl/index.json files in archive directories. + + Walks through archive/*/index.jsonl and archive/*/index.json files to discover orphaned snapshots. + Prefers index.jsonl (new format) over index.json (legacy format). 
+ """ + from archivebox.config import CONSTANTS + + archive_dir = out_dir / CONSTANTS.ARCHIVE_DIR_NAME + if not archive_dir.exists(): + return + + for entry in os.scandir(archive_dir): + if not entry.is_dir(): + continue + + # Try index.jsonl first (new format) + jsonl_file = Path(entry.path) / CONSTANTS.JSONL_INDEX_FILENAME + json_file = Path(entry.path) / CONSTANTS.JSON_INDEX_FILENAME + + link = None + + if jsonl_file.exists(): + try: + with open(jsonl_file, encoding="utf-8") as f: + for line in f: + line = line.strip() + if line.startswith("{"): + record = json.loads(line) + if record.get("type") == "Snapshot": + link = record + break + except (json.JSONDecodeError, KeyError, TypeError): + pass + + if link is None and json_file.exists(): + try: + with open(json_file, encoding="utf-8") as f: + link = json.load(f) + except (json.JSONDecodeError, KeyError, TypeError): + pass + + if link: + yield { + "url": link.get("url", ""), + "timestamp": link.get("timestamp", entry.name), + "title": link.get("title"), + "tags": link.get("tags", ""), + } diff --git a/archivebox/misc/logging.py b/archivebox/misc/logging.py new file mode 100644 index 0000000000..61affd0e17 --- /dev/null +++ b/archivebox/misc/logging.py @@ -0,0 +1,93 @@ +__package__ = "archivebox.misc" + +# Low-level logging primitives (Rich console, ANSI colors, stdout/stderr helpers) +# Higher-level logging functions are in logging_util.py + +import sys +from collections import defaultdict +from random import randint + +from benedict import benedict +from rich.console import Console +from rich.highlighter import Highlighter + +# SETUP RICH CONSOLE / TTY detection / COLOR / PROGRESS BARS +# Disable wrapping - use soft_wrap=True and large width so text flows naturally +# Colors are preserved, just no hard line breaks inserted +CONSOLE = Console(width=32768, soft_wrap=True, force_terminal=True) +STDERR = Console(stderr=True, width=32768, soft_wrap=True, force_terminal=True) +IS_TTY = sys.stdout.isatty() + + +class 
RainbowHighlighter(Highlighter): + def highlight(self, text): + for index in range(len(text)): + text.stylize(f"color({randint(90, 98)})", index, index + 1) + + +rainbow = RainbowHighlighter() + + +DEFAULT_CLI_COLORS = benedict( + { + "reset": "\033[00;00m", + "lightblue": "\033[01;30m", + "lightyellow": "\033[01;33m", + "lightred": "\033[01;35m", + "red": "\033[01;31m", + "green": "\033[01;32m", + "blue": "\033[01;34m", + "white": "\033[01;37m", + "black": "\033[01;30m", + }, +) +ANSI = benedict({k: "" for k in DEFAULT_CLI_COLORS.keys()}) + +COLOR_DICT = defaultdict( + lambda: [(0, 0, 0), (0, 0, 0)], + { + "00": [(0, 0, 0), (0, 0, 0)], + "30": [(0, 0, 0), (0, 0, 0)], + "31": [(255, 0, 0), (128, 0, 0)], + "32": [(0, 200, 0), (0, 128, 0)], + "33": [(255, 255, 0), (128, 128, 0)], + "34": [(0, 0, 255), (0, 0, 128)], + "35": [(255, 0, 255), (128, 0, 128)], + "36": [(0, 255, 255), (0, 128, 128)], + "37": [(255, 255, 255), (255, 255, 255)], + }, +) + + +# Logging Helpers (DEPRECATED, use rich.print instead going forward) +def stdout(*args, color: str | None = None, prefix: str = "", config: benedict | None = None) -> None: + ansi = DEFAULT_CLI_COLORS if (config or {}).get("USE_COLOR") else ANSI + + if color: + strs = [ansi[color], " ".join(str(a) for a in args), ansi["reset"], "\n"] + else: + strs = [" ".join(str(a) for a in args), "\n"] + + sys.stdout.write(prefix + "".join(strs)) + + +def stderr(*args, color: str | None = None, prefix: str = "", config: benedict | None = None) -> None: + ansi = DEFAULT_CLI_COLORS if (config or {}).get("USE_COLOR") else ANSI + + if color: + strs = [ansi[color], " ".join(str(a) for a in args), ansi["reset"], "\n"] + else: + strs = [" ".join(str(a) for a in args), "\n"] + + sys.stderr.write(prefix + "".join(strs)) + + +def hint(text: tuple[str, ...] 
| list[str] | str, prefix=" ", config: benedict | None = None) -> None: + ansi = DEFAULT_CLI_COLORS if (config or {}).get("USE_COLOR") else ANSI + + if isinstance(text, str): + stderr(f"{prefix}{ansi['lightred']}Hint:{ansi['reset']} {text}") + else: + stderr(f"{prefix}{ansi['lightred']}Hint:{ansi['reset']} {text[0]}") + for line in text[1:]: + stderr(f"{prefix} {line}") diff --git a/archivebox/misc/logging_util.py b/archivebox/misc/logging_util.py new file mode 100644 index 0000000000..c5458eebf4 --- /dev/null +++ b/archivebox/misc/logging_util.py @@ -0,0 +1,755 @@ +__package__ = "archivebox" + +# High-level logging functions for CLI output and progress tracking +# Low-level primitives (Rich console, ANSI colors) are in logging.py + +import re +import os +import sys +import time + +from math import log +from multiprocessing import Process +from pathlib import Path + +from datetime import datetime, timezone +from dataclasses import dataclass +from typing import Any, Optional, IO, TYPE_CHECKING, cast +from collections.abc import Iterable + +if TYPE_CHECKING: + from archivebox.core.models import Snapshot + +from rich import print +from rich.panel import Panel + +from archivebox.config import CONSTANTS, DATA_DIR, VERSION +from archivebox.config.common import SHELL_CONFIG +from archivebox.misc.system import get_dir_size +from archivebox.misc.util import enforce_types +from archivebox.misc.logging import ANSI + + +@dataclass +class RuntimeStats: + """mutable stats counter for logging archiving timing info to CLI output""" + + skipped: int = 0 + succeeded: int = 0 + failed: int = 0 + + parse_start_ts: datetime | None = None + parse_end_ts: datetime | None = None + + index_start_ts: datetime | None = None + index_end_ts: datetime | None = None + + archiving_start_ts: datetime | None = None + archiving_end_ts: datetime | None = None + + +# globals are bad, mmkay +_LAST_RUN_STATS = RuntimeStats() + + +class TimedProgress: + """Show a progress bar and measure elapsed time 
until .end() is called""" + + def __init__(self, seconds, prefix=""): + + self.SHOW_PROGRESS = SHELL_CONFIG.SHOW_PROGRESS + self.ANSI = SHELL_CONFIG.ANSI + + if self.SHOW_PROGRESS: + self.p = Process(target=progress_bar, args=(seconds, prefix, self.ANSI)) + self.p.start() + + self.stats = {"start_ts": datetime.now(timezone.utc), "end_ts": None} + + def end(self): + """immediately end progress, clear the progressbar line, and save end_ts""" + + end_ts = datetime.now(timezone.utc) + self.stats["end_ts"] = end_ts + + if self.SHOW_PROGRESS: + # terminate if we havent already terminated + try: + # kill the progress bar subprocess + try: + self.p.close() # must be closed *before* its terminnated + except (KeyboardInterrupt, SystemExit): + print() + raise + except BaseException: # lgtm [py/catch-base-exception] + pass + self.p.terminate() + time.sleep(0.1) + # sometimes the timer doesn't terminate properly, then blocks at the join until + # the full time has elapsed. sending a kill tries to avoid that. 
@enforce_types
def progress_bar(seconds: int, prefix: str = "", ANSI: dict[str, str] = ANSI) -> None:
    """show timer in the form of progress bar, with percentage and seconds remaining

    Redraws a single terminal line roughly `chunks` times per second for
    `seconds` seconds, then leaves a full bar in the "red" color on screen.
    Returns quietly on KeyboardInterrupt or a broken output pipe.

    Args:
        seconds: total duration the bar represents.
        prefix: text printed before the bar on every redraw.
        ANSI: mapping of color name -> escape sequence ("green", "lightyellow",
            "red" and "reset" are read).
    """
    output_buf = sys.stdout or sys.__stdout__ or sys.stderr or sys.__stderr__
    if output_buf is None:
        # nowhere to draw
        return

    # encoding can be None/missing on replaced streams; fall back to the ASCII bar then
    encoding = (getattr(output_buf, "encoding", None) or "").upper()
    chunk = "█" if encoding == "UTF-8" else "#"
    last_width = SHELL_CONFIG.TERM_WIDTH
    # number of progress chunks to show (aka max bar width); never below 1 so the
    # divisions and the sleep interval below stay defined on very narrow terminals
    chunks = max(last_width - len(prefix) - 20, 1)
    try:
        for s in range(seconds * chunks):
            max_width = SHELL_CONFIG.TERM_WIDTH
            if max_width < last_width:
                # when the terminal size is shrunk, we have to write a newline
                # otherwise the progress bar will keep wrapping incorrectly
                output_buf.write("\r\n")
                output_buf.flush()
                chunks = max(max_width - len(prefix) - 20, 1)
            pct_complete = s / chunks / seconds * 100
            log_pct = (log(pct_complete or 1, 10) / 2) * 100  # everyone likes faster progress bars ;)
            bar_width = round(log_pct / (100 / chunks))
            last_width = max_width

            # ████████████████████ 0.9% (1/60sec)
            output_buf.write(
                "\r{}{}{}{} {}% ({}/{}sec)".format(
                    prefix,
                    ANSI["green" if pct_complete < 80 else "lightyellow"],
                    (chunk * bar_width).ljust(chunks),
                    ANSI["reset"],
                    round(pct_complete, 1),
                    round(s / chunks),
                    seconds,
                ),
            )
            output_buf.flush()
            time.sleep(1 / chunks)

        # ██████████████████████████████████ 100.0% (60/60sec)
        output_buf.write(
            "\r{}{}{}{} {}% ({}/{}sec)".format(
                prefix,
                ANSI["red"],
                chunk * chunks,
                ANSI["reset"],
                100.0,
                seconds,
                seconds,
            ),
        )
        output_buf.flush()
        # uncomment to have it disappear when it hits 100% instead of staying full red:
        # time.sleep(0.5)
        # sys.stdout.write('\r{}{}\r'.format((' ' * SHELL_CONFIG.TERM_WIDTH), ANSI['reset']))
        # sys.stdout.flush()
    except (KeyboardInterrupt, BrokenPipeError):
        print()
def log_archiving_started(num_links: int, resume: float | None = None):
    """Record the archiving start time and print the "starting"/"resuming" banner.

    Stores the current UTC time in _LAST_RUN_STATS.archiving_start_ts.

    Args:
        num_links: number of snapshots/pages about to be processed.
        resume: when truthy, the value shown as the point archiving resumes from.
    """
    start_ts = datetime.now(timezone.utc)
    _LAST_RUN_STATS.archiving_start_ts = start_ts
    print()
    # The status glyph is "▶"; it had been stored as a mis-decoded byte sequence.
    if resume:
        print(
            "[green][▶] [{}] Resuming archive updating for {} pages starting from {}...[/]".format(
                start_ts.strftime("%Y-%m-%d %H:%M:%S"),
                num_links,
                resume,
            ),
        )
    else:
        print(
            "[green][▶] [{}] Starting archiving of {} snapshots in index...[/]".format(
                start_ts.strftime("%Y-%m-%d %H:%M:%S"),
                num_links,
            ),
        )
duration = f"{seconds:.2f} sec" + + print() + print( + "[green][√] [{}] Update of {} pages complete ({})[/]".format( + end_ts.strftime("%Y-%m-%d %H:%M:%S"), + num_links, + duration, + ), + ) + print(f" - {_LAST_RUN_STATS.skipped} links skipped") + print(f" - {_LAST_RUN_STATS.succeeded + _LAST_RUN_STATS.failed} links updated") + print(f" - {_LAST_RUN_STATS.failed} links had errors") + + if Snapshot.objects.count() < 50: + print() + print(" [violet]Hint:[/] To manage your archive in a Web UI, run:") + print(" archivebox server 0.0.0.0:8000") + + +def log_snapshot_archiving_started(snapshot: "Snapshot", out_dir: str, is_new: bool): + + # [*] [2019-03-22 13:46:45] "Log Structured Merge Trees - ben stopford" + # http://www.benstopford.com/2015/02/14/log-structured-merge-trees/ + # > output/archive/1478739709 + + print( + '\n[[{symbol_color}]{symbol}[/]] [[{symbol_color}]{now}[/]] "{title}"'.format( + symbol_color="green" if is_new else "bright_black", + symbol="+" if is_new else "√", + now=datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S"), + title=snapshot.title or snapshot.base_url, + ), + ) + print(f" [sky_blue1]{snapshot.url}[/]") + print( + " {} {}".format( + ">" if is_new else "√", + pretty_path(out_dir), + ), + ) + + +def log_snapshot_archiving_finished(snapshot: "Snapshot", out_dir: str, is_new: bool, stats: dict, start_ts: datetime): + total = sum(stats.values()) + + if stats["failed"] > 0: + _LAST_RUN_STATS.failed += 1 + elif stats["skipped"] == total: + _LAST_RUN_STATS.skipped += 1 + else: + _LAST_RUN_STATS.succeeded += 1 + + try: + results = snapshot.archiveresult_set.only("output_files", "output_size") + total_bytes = sum(result.output_size or result.output_size_from_files() for result in results) + total_files = sum(result.output_file_count() for result in results) + size = (total_bytes, 0, total_files) + except Exception: + try: + size = get_dir_size(out_dir) + except FileNotFoundError: + size = (0, None, "0") + + end_ts = datetime.now(timezone.utc) 
+ duration = str(end_ts - start_ts).split(".")[0] + print(f" [bright_black]{size[2]} files ({printable_filesize(size[0])}) in {duration}s [/]") + + +def log_archive_method_started(method: str): + print(f" > {method}") + + +def log_archive_method_finished(result: dict): + """ + quote the argument with whitespace in a command so the user can + copy-paste the outputted string directly to run the cmd + """ + # Prettify CMD string and make it safe to copy-paste by quoting arguments + quoted_cmd = " ".join(f'"{arg}"' if (" " in arg) or (":" in arg) else arg for arg in result["cmd"]) + + if result["status"] == "failed": + output = result.get("output") + if output and output.__class__.__name__ == "TimeoutExpired": + duration = (result["end_ts"] - result["start_ts"]).seconds + hint_header = [ + f"[yellow3]Extractor timed out after {duration}s.[/]", + ] + else: + error_name = output.__class__.__name__.replace("ArchiveError", "") if output else "Error" + hint_header = [ + "[yellow3]Extractor failed:[/]", + f" {error_name} [red1]{output}[/]", + ] + + # Prettify error output hints string and limit to five lines + hints = getattr(output, "hints", None) or () if output else () + if hints: + if isinstance(hints, (list, tuple, type(_ for _ in ()))): + hints = [hint.decode() if isinstance(hint, bytes) else str(hint) for hint in hints] + else: + if isinstance(hints, bytes): + hints = hints.decode() + hints = hints.split("\n") + + hints = (f" [yellow1]{line.strip()}[/]" for line in list(hints)[:5] if line.strip()) + + docker_hints = () + if os.environ.get("IN_DOCKER") in ("1", "true", "True", "TRUE", "yes"): + docker_hints = (" docker run -it -v $PWD/data:/data archivebox/archivebox /bin/bash",) + + # Collect and prefix output lines with indentation + output_lines = [ + *hint_header, + *hints, + "[violet]Run to see full output:[/]", + *docker_hints, + *([" cd {};".format(result.get("pwd"))] if result.get("pwd") else []), + f" {quoted_cmd}", + ] + print( + "\n".join(f" {line}" for line 
def log_removal_started(snapshots, yes: bool, delete: bool):
    """Announce a pending removal and, unless *yes* is set, ask for confirmation.

    Args:
        snapshots: the matching snapshots; either a list/tuple or an object with
            a zero-argument ``count()`` method. Items must expose
            ``num_outputs`` and ``output_dir`` when *delete* is set.
        yes: skip the interactive confirmation prompt.
        delete: whether the archived data folders will be deleted as well.

    Raises:
        SystemExit: with code 0 when the user does not answer "y", or when the
            prompt is interrupted (Ctrl-C / EOF).
    """
    # list and tuple also have a .count attribute, but it needs an argument, so a
    # plain hasattr() check cannot tell them apart from count()-style collections.
    if isinstance(snapshots, (list, tuple)):
        count = len(snapshots)
    else:
        count = snapshots.count() if hasattr(snapshots, "count") else len(snapshots)

    print(f"[yellow3][i] Found {count} matching URLs to remove.[/]")
    if delete:
        file_counts = [s.num_outputs for s in snapshots if os.access(s.output_dir, os.R_OK)]
        print(
            f" {count} Links will be de-listed from the main index, and their archived content folders will be deleted from disk.\n"
            f" ({len(file_counts)} data folders with {sum(file_counts)} archived files will be deleted!)",
        )
    else:
        print(
            " Matching links will be de-listed from the main index, but their archived content folders will remain in place on disk.\n"
            " (Pass --delete if you also want to permanently delete the data folders)",
        )

    if yes:
        return

    print()
    print(f"[yellow3][?] Do you want to proceed with removing these {count} links?[/]")
    # Explicit comparison instead of `assert`: assert statements are removed under
    # `python -O`, which would let the removal proceed whatever the user typed.
    try:
        confirmed = input(" y/[n]: ").lower() == "y"
    except (KeyboardInterrupt, EOFError):
        confirmed = False
    if not confirmed:
        raise SystemExit(0)
f"{seconds:.1f}s" + elif seconds < 3600: + minutes = int(seconds // 60) + secs = int(seconds % 60) + return f"{minutes}min {secs}s" if secs else f"{minutes}min" + else: + hours = int(seconds // 3600) + minutes = int((seconds % 3600) // 60) + return f"{hours}hr {minutes}min" if minutes else f"{hours}hr" + + +@enforce_types +def truncate_url(url: str, max_length: int = 60) -> str: + """Truncate URL to max_length, keeping domain and adding ellipsis.""" + if len(url) <= max_length: + return url + # Try to keep the domain and beginning of path + if "://" in url: + protocol, rest = url.split("://", 1) + if "/" in rest: + domain, path = rest.split("/", 1) + available = max_length - len(protocol) - len(domain) - 6 # for "://", "/", "..." + if available > 10: + return f"{protocol}://{domain}/{path[:available]}..." + # Fallback: just truncate + return url[: max_length - 3] + "..." + + +@enforce_types +def log_worker_event( + worker_type: str, + event: str, + indent_level: int = 0, + pid: int | None = None, + worker_id: str | None = None, + url: str | None = None, + plugin: str | None = None, + metadata: dict[str, Any] | None = None, + error: Exception | None = None, +) -> None: + """ + Log a worker event with structured metadata and indentation. + + Args: + worker_type: Type of worker (Orchestrator, CrawlWorker, SnapshotWorker) + event: Event name (Starting, Completed, Failed, etc.) 
+ indent_level: Indentation level (0=Orchestrator, 1=CrawlWorker, 2=SnapshotWorker) + pid: Process ID + worker_id: Worker ID (UUID for workers) + url: URL being processed (for SnapshotWorker) + plugin: Plugin name (for hook processes) + metadata: Dict of metadata to show in curly braces + error: Exception if event is an error + """ + indent = " " * indent_level + + from rich.markup import escape + + # Build worker identifier (without URL/plugin) + worker_parts = [worker_type] + # Don't add pid/worker_id for DB operations (they happen in whatever process is running) + if pid and worker_type != "DB": + worker_parts.append(f"pid={pid}") + if worker_id and worker_type in ("CrawlWorker", "Orchestrator") and worker_type != "DB": + worker_parts.append(f"id={worker_id}") + + # Build worker label parts for brackets (shown inside brackets) + worker_label_base = worker_parts[0] + worker_bracket_content = ", ".join(worker_parts[1:]) if len(worker_parts) > 1 else None + + # Build URL/plugin display (shown AFTER the label, outside brackets) + url_extractor_parts = [] + if url: + url_extractor_parts.append(f"url: {escape(url)}") + if plugin: + url_extractor_parts.append(f"extractor: {escape(plugin)}") + + url_extractor_str = " | ".join(url_extractor_parts) if url_extractor_parts else "" + + # Build metadata string + metadata_str = "" + if metadata: + # Format metadata nicely + meta_parts = [] + for k, v in metadata.items(): + if isinstance(v, float): + # Format floats nicely (durations, sizes) + if "duration" in k.lower(): + meta_parts.append(f"{k}: {format_duration(v)}") + elif "size" in k.lower(): + meta_parts.append(f"{k}: {printable_filesize(int(v))}") + else: + meta_parts.append(f"{k}: {v:.2f}") + elif isinstance(v, int): + # Format integers - check if it's a size + if "size" in k.lower() or "bytes" in k.lower(): + meta_parts.append(f"{k}: {printable_filesize(v)}") + else: + meta_parts.append(f"{k}: {v}") + elif isinstance(v, (list, tuple)): + meta_parts.append(f"{k}: 
{len(v)}") + else: + meta_parts.append(f"{k}: {v}") + metadata_str = " | ".join(meta_parts) + + # Determine color based on event + color = "white" + if event in ("Starting...", "Started", "STARTED", "Started in background"): + color = "green" + elif event.startswith("Created"): + color = "cyan" # DB creation events + elif event in ("Completed", "COMPLETED", "All work complete"): + color = "blue" + elif event in ("Failed", "ERROR", "Failed to spawn worker"): + color = "red" + elif event in ("Shutting down", "SHUTDOWN"): + color = "grey53" + + # Build final message + error_str = f" {type(error).__name__}: {error}" if error else "" + from archivebox.misc.logging import CONSOLE, STDERR + from rich.text import Text + + # Create a Rich Text object for proper formatting + # Text.append() treats content as literal (no markup parsing) + text = Text() + text.append(indent) + text.append(worker_label_base, style=color) + + # Add bracketed content if present (using Text.append to avoid markup issues) + if worker_bracket_content: + text.append("[", style=color) + text.append(worker_bracket_content, style=color) + text.append("]", style=color) + + text.append(f" {event}{error_str}", style=color) + + # Add URL/plugin info first (more important) + if url_extractor_str: + text.append(f" | {url_extractor_str}") + + # Then add other metadata + if metadata_str: + text.append(f" | {metadata_str}") + + # Stdout is reserved for JSONL records whenever commands are piped together. + # Route worker/DB progress to stderr in non-TTY contexts so pipelines like + # `archivebox snapshot list | archivebox run` keep stdout machine-readable. 
+ output_console = CONSOLE if sys.stdout.isatty() else STDERR + output_console.print(text, soft_wrap=True) + + +@enforce_types +def printable_folders(folders: dict[str, Optional["Snapshot"]], with_headers: bool = False) -> str: + return "\n".join(f'{folder} {snapshot and snapshot.url} "{snapshot and snapshot.title}"' for folder, snapshot in folders.items()) + + +@enforce_types +def printable_config(config: dict, prefix: str = "") -> str: + return f"\n{prefix}".join(f"{key}={val}" for key, val in config.items() if not (isinstance(val, dict) or callable(val))) + + +@enforce_types +def printable_folder_status(name: str, folder: dict) -> str: + if folder["enabled"]: + if folder["is_valid"]: + color, symbol, note, num_files = "green", "√", "valid", "" + else: + color, symbol, note, num_files = "red", "X", "invalid", "?" + else: + color, symbol, note, num_files = "grey53", "-", "unused", "-" + + if folder["path"]: + if os.access(folder["path"], os.R_OK): + try: + num_files = ( + f"{len(os.listdir(folder['path']))} files" + if os.path.isdir(folder["path"]) + else printable_filesize(Path(folder["path"]).stat().st_size) + ) + except PermissionError: + num_files = "error" + else: + num_files = "missing" + + if folder.get("is_mount"): + # add symbol @ next to filecount if path is a remote filesystem mount + num_files = f"{num_files} @" if num_files else "@" + + path = pretty_path(folder["path"]) + + return " ".join( + ( + f"[{color}]", + symbol, + "[/]", + name.ljust(21).replace("DATA_DIR", "[light_slate_blue]DATA_DIR[/light_slate_blue]"), + num_files.ljust(14).replace("missing", "[grey53]missing[/grey53]"), + f"[{color}]", + note.ljust(8), + "[/]", + path.ljust(76), + ), + ) + + +@enforce_types +def printable_dependency_version(name: str, dependency: dict) -> str: + color, symbol, note, version = "red", "X", "invalid", "?" 
+ + if dependency["enabled"]: + if dependency["is_valid"]: + color, symbol, note = "green", "√", "valid" + + parsed_version_num = re.search(r"[\d\.]+", dependency["version"]) + if parsed_version_num: + version = f"v{parsed_version_num[0]}" + else: + color, symbol, note, version = "lightyellow", "-", "disabled", "-" + + path = pretty_path(dependency["path"]) + + return " ".join( + ( + ANSI[color], + symbol, + ANSI["reset"], + name.ljust(21), + version.ljust(14), + ANSI[color], + note.ljust(8), + ANSI["reset"], + path.ljust(76), + ), + ) diff --git a/archivebox/misc/monkey_patches.py b/archivebox/misc/monkey_patches.py new file mode 100644 index 0000000000..2043038556 --- /dev/null +++ b/archivebox/misc/monkey_patches.py @@ -0,0 +1,72 @@ +__package__ = "archivebox" + + +import datetime +import warnings + +import benedict +from daphne import access +import django_stubs_ext +from django.utils import timezone + +django_stubs_ext.monkeypatch() + + +# monkey patch django timezone to add back utc (it was removed in Django 5.0) +setattr(timezone, "utc", datetime.UTC) + +# monkey patch django-signals-webhooks to change how it shows up in Admin UI +# from signal_webhooks.apps import DjangoSignalWebhooksConfig +# DjangoSignalWebhooksConfig.verbose_name = 'API' + + +# Rich traceback handler disabled - it adds frames/boxes that wrap weirdly in log files +# Standard Python tracebacks are used instead (full width, no frames) +# from rich.traceback import install +# install(show_locals=True, word_wrap=False, ...) + + +# Hide site-packages/sonic/client.py:115: SyntaxWarning +# https://github.com/xmonader/python-sonic-client/pull/18 +warnings.filterwarnings("ignore", category=SyntaxWarning, module="sonic") + + +# Make daphne log requests quieter and easier to read +class ModifiedAccessLogGenerator(access.AccessLogGenerator): + """Clutge workaround until daphne uses the Python logging framework. 
https://github.com/django/daphne/pull/473/files""" + + def write_entry(self, host, date, request, status=None, length=None, ident=None, user=None): + + # Ignore noisy requests to staticfiles / favicons / etc. + if "GET /static/" in request: + return + if "GET /health/" in request: + return + if "GET /admin/jsi18n/" in request: + return + if request.endswith("/favicon.ico") or request.endswith("/robots.txt") or request.endswith("/screenshot.png"): + return + if request.endswith(".css") or request.endswith(".js") or request.endswith(".woff") or request.endswith(".ttf"): + return + if str(status) in ("404", "304"): + return + + # clean up the log format to mostly match the same format as django.conf.settings.LOGGING rich formats + self.stream.write( + "%s HTTP %s %s %s\n" + % ( + date.strftime("%Y-%m-%d %H:%M:%S"), + request, + status or "-", + "localhost" if host.startswith("127.") else host.split(":")[0], + ), + ) + + +access.AccessLogGenerator.write_entry = ModifiedAccessLogGenerator.write_entry # type: ignore + + +# fix benedict objects to pretty-print/repr more nicely with rich +# https://stackoverflow.com/a/79048811/2156113 +# https://rich.readthedocs.io/en/stable/pretty.html#rich-repr-protocol +benedict.benedict.__rich_repr__ = lambda self: (dict(self),) # type: ignore diff --git a/archivebox/misc/paginators.py b/archivebox/misc/paginators.py new file mode 100644 index 0000000000..86ca540b12 --- /dev/null +++ b/archivebox/misc/paginators.py @@ -0,0 +1,35 @@ +__package__ = "archivebox.misc" + +from django.core.paginator import Paginator +from django.utils.functional import cached_property + + +class AcceleratedPaginator(Paginator): + """ + Accelerated paginator ignores DISTINCT when counting total number of rows. + Speeds up SELECT Count(*) on Admin views by >20x. 
+ https://hakibenita.com/optimizing-the-django-admin-paginator + """ + + @cached_property + def count(self): + has_filters = getattr(self.object_list, "_has_filters", None) + if callable(has_filters) and has_filters(): + # fallback to normal count method on filtered queryset + return super().count + + model = getattr(self.object_list, "model", None) + if model is None: + return super().count + + # otherwise count total rows in a separate fast query + return model.objects.count() + + # Alternative approach for PostgreSQL: fallback count takes > 200ms + # from django.db import connection, transaction, OperationalError + # with transaction.atomic(), connection.cursor() as cursor: + # cursor.execute('SET LOCAL statement_timeout TO 200;') + # try: + # return super().count + # except OperationalError: + # return 9999999999999 diff --git a/archivebox/misc/serve_static.py b/archivebox/misc/serve_static.py new file mode 100644 index 0000000000..84da77648e --- /dev/null +++ b/archivebox/misc/serve_static.py @@ -0,0 +1,954 @@ +import html +import json +import re +import os +import stat +import asyncio +import posixpath +import mimetypes +import importlib +import queue +import threading +import time +import zipfile +from datetime import datetime +from collections.abc import Callable +from pathlib import Path +from urllib.parse import urlencode + +from django.contrib.staticfiles import finders +from django.template import TemplateDoesNotExist, loader +from django.views import static +from django.http import StreamingHttpResponse, Http404, HttpResponse, HttpResponseNotModified +from django.utils._os import safe_join +from django.utils.http import http_date +from django.utils.translation import gettext as _ +from archivebox.config.common import SERVER_CONFIG +from archivebox.misc.logging_util import printable_filesize + + +_HASHES_CACHE: dict[Path, tuple[float, dict[str, str]]] = {} + + +def _load_hash_map(snapshot_dir: Path) -> dict[str, str] | None: + hashes_path = snapshot_dir 
/ "hashes" / "hashes.json" + if not hashes_path.exists(): + return None + try: + mtime = hashes_path.stat().st_mtime + except OSError: + return None + + cached = _HASHES_CACHE.get(hashes_path) + if cached and cached[0] == mtime: + return cached[1] + + try: + data = json.loads(hashes_path.read_text(encoding="utf-8")) + except Exception: + return None + + file_map = {str(entry.get("path")): entry.get("hash") for entry in data.get("files", []) if entry.get("path")} + _HASHES_CACHE[hashes_path] = (mtime, file_map) + return file_map + + +def _hash_for_path(document_root: Path, rel_path: str) -> str | None: + file_map = _load_hash_map(document_root) + if not file_map: + return None + return file_map.get(rel_path) + + +def _cache_policy() -> str: + return "public" if SERVER_CONFIG.PUBLIC_SNAPSHOTS else "private" + + +def _format_direntry_timestamp(stat_result: os.stat_result) -> str: + timestamp = getattr(stat_result, "st_birthtime", None) or stat_result.st_mtime + return datetime.fromtimestamp(timestamp).strftime("%Y-%m-%d %H:%M") + + +def _safe_zip_stem(name: str) -> str: + safe_name = re.sub(r"[^A-Za-z0-9._-]+", "-", name).strip("._-") + return safe_name or "archivebox" + + +class _StreamingQueueWriter: + """Expose a write-only file-like object so zipfile can stream into a queue.""" + + def __init__(self, output_queue: queue.Queue[bytes | BaseException | object]) -> None: + self.output_queue = output_queue + self.position = 0 + + def write(self, data: bytes) -> int: + if data: + self.output_queue.put(data) + self.position += len(data) + return len(data) + + def tell(self) -> int: + return self.position + + def flush(self) -> None: + return None + + def close(self) -> None: + return None + + def writable(self) -> bool: + return True + + def seekable(self) -> bool: + return False + + +def _iter_visible_files(root: Path): + """Yield non-hidden files in a stable order so ZIP output is deterministic.""" + + for current_root, dirnames, filenames in os.walk(root): + 
dirnames[:] = sorted(dirname for dirname in dirnames if not dirname.startswith(".")) + for filename in sorted(name for name in filenames if not name.startswith(".")): + yield Path(current_root) / filename + + +def _build_directory_zip_response( + fullpath: Path, + path: str, + *, + is_archive_replay: bool, + use_async_stream: bool, +) -> StreamingHttpResponse: + root_name = _safe_zip_stem(fullpath.name or Path(path).name or "archivebox") + sentinel = object() + output_queue: queue.Queue[bytes | BaseException | object] = queue.Queue(maxsize=8) + initial_chunk_target = 64 * 1024 + initial_chunk_wait = 0.05 + + def build_zip() -> None: + # zipfile wants a write-only file object. Feed those bytes straight into + # a queue so the response can stream them out as soon as they are ready. + writer = _StreamingQueueWriter(output_queue) + try: + with zipfile.ZipFile(writer, mode="w", compression=zipfile.ZIP_DEFLATED, compresslevel=6) as zip_file: + for entry in _iter_visible_files(fullpath): + rel_parts = entry.relative_to(fullpath).parts + arcname = Path(root_name, *rel_parts).as_posix() + zip_file.write(entry, arcname) + except BaseException as err: + output_queue.put(err) + finally: + output_queue.put(sentinel) + + threading.Thread(target=build_zip, name=f"zip-stream-{root_name}", daemon=True).start() + + def iter_zip_chunks(): + # Emit a meaningful first chunk quickly so browsers show the download + # immediately instead of waiting on dozens of tiny ZIP header writes. 
+ first_chunk = bytearray() + initial_deadline = time.monotonic() + initial_chunk_wait + + while True: + timeout = max(initial_deadline - time.monotonic(), 0) if len(first_chunk) < initial_chunk_target else None + try: + chunk = output_queue.get(timeout=timeout) if timeout is not None else output_queue.get() + except queue.Empty: + if first_chunk: + yield bytes(first_chunk) + first_chunk.clear() + continue + chunk = output_queue.get() + + if chunk is sentinel: + if first_chunk: + yield bytes(first_chunk) + break + if isinstance(chunk, BaseException): + raise chunk + if len(first_chunk) < initial_chunk_target: + first_chunk.extend(chunk) + if len(first_chunk) >= initial_chunk_target or time.monotonic() >= initial_deadline: + yield bytes(first_chunk) + first_chunk.clear() + continue + yield chunk + + async def stream_zip_async(): + # Django ASGI buffers sync StreamingHttpResponse iterators by consuming + # them into a list. Drive the same sync iterator from a worker thread so + # Daphne can send each chunk as it arrives instead of buffering the ZIP. 
+ iterator = iter(iter_zip_chunks()) + while True: + chunk = await asyncio.to_thread(next, iterator, None) + if chunk is None: + break + yield chunk + + response = StreamingHttpResponse( + stream_zip_async() if use_async_stream else iter_zip_chunks(), + content_type="application/zip", + ) + response.headers["Content-Disposition"] = f'attachment; filename="{root_name}.zip"' + response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=60, stale-while-revalidate=300" + response.headers["Last-Modified"] = http_date(fullpath.stat().st_mtime) + response.headers["X-Accel-Buffering"] = "no" + return _apply_archive_replay_headers( + response, + fullpath=fullpath, + content_type="application/zip", + is_archive_replay=is_archive_replay, + ) + + +def _render_directory_index(request, path: str, fullpath: Path) -> HttpResponse: + try: + template = loader.select_template( + [ + "static/directory_index.html", + "static/directory_index", + ], + ) + except TemplateDoesNotExist: + return static.directory_index(path, fullpath) + + entries = [] + file_list = [] + visible_entries = sorted( + (entry for entry in fullpath.iterdir() if not entry.name.startswith(".")), + key=lambda entry: (not entry.is_dir(), entry.name.lower()), + ) + for entry in visible_entries: + url = str(entry.relative_to(fullpath)) + if entry.is_dir(): + url += "/" + file_list.append(url) + + stat_result = entry.stat() + entries.append( + { + "name": url, + "url": url, + "is_dir": entry.is_dir(), + "size": "—" if entry.is_dir() else printable_filesize(stat_result.st_size), + "timestamp": _format_direntry_timestamp(stat_result), + }, + ) + + zip_query = request.GET.copy() + zip_query["download"] = "zip" + zip_url = request.path + if zip_query: + zip_url = f"{zip_url}?{zip_query.urlencode()}" + + context = { + "directory": f"{path}/", + "file_list": file_list, + "entries": entries, + "zip_url": zip_url, + } + return HttpResponse(template.render(context)) + + +# Ensure common web types are mapped consistently 
across platforms. +mimetypes.add_type("text/html", ".html") +mimetypes.add_type("text/html", ".htm") +mimetypes.add_type("text/css", ".css") +mimetypes.add_type("application/javascript", ".js") +mimetypes.add_type("application/json", ".json") +mimetypes.add_type("application/x-ndjson", ".jsonl") +mimetypes.add_type("text/markdown", ".md") +mimetypes.add_type("text/yaml", ".yml") +mimetypes.add_type("text/yaml", ".yaml") +mimetypes.add_type("text/csv", ".csv") +mimetypes.add_type("text/tab-separated-values", ".tsv") +mimetypes.add_type("application/xml", ".xml") +mimetypes.add_type("image/svg+xml", ".svg") + +try: + _markdown = getattr(importlib.import_module("markdown"), "markdown") +except ImportError: + _markdown: Callable[..., str] | None = None + +MARKDOWN_INLINE_LINK_RE = re.compile(r"\[([^\]]+)\]\(([^)\s]+(?:\([^)]*\)[^)\s]*)*)\)") +MARKDOWN_INLINE_IMAGE_RE = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)") +MARKDOWN_BOLD_RE = re.compile(r"\*\*([^*]+)\*\*") +MARKDOWN_ITALIC_RE = re.compile(r"(?<!\*)\*([^*]+)\*(?!\*)") +HTML_TAG_RE = re.compile(r"<[A-Za-z][^>]*>") +HTML_BODY_RE = re.compile(r"<body[^>]*>(.*)</body>", flags=re.IGNORECASE | re.DOTALL) +RISKY_REPLAY_MIMETYPES = { + "text/html", + "application/xhtml+xml", + "image/svg+xml", +} +RISKY_REPLAY_EXTENSIONS = {".html", ".htm", ".xhtml", ".svg", ".svgz"} +RISKY_REPLAY_MARKERS = ( + "<!doctype html", + "<html", + "<svg", +) + + +def _extract_markdown_candidate(text: str) -> str: + candidate = text + body_match = HTML_BODY_RE.search(candidate) + if body_match: + candidate = body_match.group(1) + candidate = re.sub(r"^\s*<p[^>]*>", "", candidate, flags=re.IGNORECASE) + candidate = re.sub(r"</p>\s*$", "", candidate, flags=re.IGNORECASE) + return candidate.strip() + + +def _looks_like_markdown(text: str) -> bool: + lower = text.lower() + if "<html" in lower and "<head" in lower and "</body>" in lower: + return False + md_markers = 0 + md_markers += len(re.findall(r"^\s{0,3}#{1,6}\s+\S", text, flags=re.MULTILINE)) + 
md_markers += len(re.findall(r"^\s*[-*+]\s+\S", text, flags=re.MULTILINE)) + md_markers += len(re.findall(r"^\s*\d+\.\s+\S", text, flags=re.MULTILINE)) + md_markers += text.count("[TOC]") + md_markers += len(MARKDOWN_INLINE_LINK_RE.findall(text)) + md_markers += text.count("\n---") + text.count("\n***") + return md_markers >= 6 + + +def _render_text_preview_document(text: str, title: str) -> str: + escaped_title = html.escape(title) + escaped_text = html.escape(text) + return f"""<!doctype html> +<html lang="en"> +<head> + <meta charset="utf-8"> + <meta name="viewport" content="width=device-width, initial-scale=1"> + <title>{escaped_title} + + + +
{escaped_title}
+
{escaped_text}
+ +""" + + +def _render_image_preview_document(image_url: str, title: str) -> str: + escaped_title = html.escape(title) + escaped_url = html.escape(image_url, quote=True) + return f""" + + + + + {escaped_title} + + + +
+ {escaped_title} +
+ +""" + + +def _render_markdown_fallback(text: str) -> str: + if _markdown is not None and not HTML_TAG_RE.search(text): + try: + return _markdown( + text, + extensions=["extra", "toc", "sane_lists"], + output_format="html", + ) + except Exception: + pass + + lines = text.splitlines() + headings = [] + + def slugify(value: str) -> str: + slug = re.sub(r"[^A-Za-z0-9]+", "-", value).strip("-") + return slug or "section" + + for raw_line in lines: + heading_match = re.match(r"^\s{0,3}(#{1,6})\s+(.*)$", raw_line) + if heading_match: + level = len(heading_match.group(1)) + content = heading_match.group(2).strip() + headings.append((level, content, slugify(content))) + + html_lines = [] + in_code = False + in_ul = False + in_ol = False + in_blockquote = False + + def render_inline(markup: str) -> str: + content = MARKDOWN_INLINE_IMAGE_RE.sub(r'\1', markup) + content = MARKDOWN_INLINE_LINK_RE.sub(r'\1', content) + content = MARKDOWN_BOLD_RE.sub(r"\1", content) + content = MARKDOWN_ITALIC_RE.sub(r"\1", content) + return content + + def close_lists(): + nonlocal in_ul, in_ol + if in_ul: + html_lines.append("") + in_ul = False + if in_ol: + html_lines.append("") + in_ol = False + + for raw_line in lines: + line = raw_line.rstrip("\n") + stripped = line.strip() + + if stripped.startswith("```"): + if in_code: + html_lines.append("
") + in_code = False + else: + close_lists() + if in_blockquote: + html_lines.append("") + in_blockquote = False + html_lines.append("
")
+                in_code = True
+            continue
+
+        if in_code:
+            html_lines.append(html.escape(line))
+            continue
+
+        if not stripped:
+            close_lists()
+            if in_blockquote:
+                html_lines.append("")
+                in_blockquote = False
+            html_lines.append("
") + continue + + heading_match = re.match(r"^\s*((?:<[^>]+>\s*)*)(#{1,6})\s+(.*)$", line) + if heading_match: + close_lists() + if in_blockquote: + html_lines.append("") + in_blockquote = False + leading_tags = heading_match.group(1).strip() + level = len(heading_match.group(2)) + content = heading_match.group(3).strip() + if leading_tags: + html_lines.append(leading_tags) + html_lines.append(f'{render_inline(content)}') + continue + + if stripped in ("---", "***"): + close_lists() + html_lines.append("
") + continue + + if stripped.startswith("> "): + if not in_blockquote: + close_lists() + html_lines.append("
") + in_blockquote = True + content = stripped[2:] + html_lines.append(render_inline(content)) + continue + else: + if in_blockquote: + html_lines.append("
") + in_blockquote = False + + ul_match = re.match(r"^\s*[-*+]\s+(.*)$", line) + if ul_match: + if in_ol: + html_lines.append("") + in_ol = False + if not in_ul: + html_lines.append("
    ") + in_ul = True + html_lines.append(f"
  • {render_inline(ul_match.group(1))}
  • ") + continue + + ol_match = re.match(r"^\s*\d+\.\s+(.*)$", line) + if ol_match: + if in_ul: + html_lines.append("
") + in_ul = False + if not in_ol: + html_lines.append("
    ") + in_ol = True + html_lines.append(f"
  1. {render_inline(ol_match.group(1))}
  2. ") + continue + + close_lists() + + # Inline conversions (leave raw HTML intact) + if stripped == "[TOC]": + toc_items = [] + for level, title, slug in headings: + toc_items.append( + f'
  3. {title}
  4. ', + ) + html_lines.append( + '", + ) + continue + + html_lines.append(f"

    {render_inline(line)}

    ") + + close_lists() + if in_blockquote: + html_lines.append("") + if in_code: + html_lines.append("
") + + return "\n".join(html_lines) + + +def _render_markdown_document(markdown_text: str) -> str: + body = _render_markdown_fallback(markdown_text) + wrapped = ( + '' + '' + "" + "" + f"{body}" + "" + ) + return wrapped + + +def _content_type_base(content_type: str) -> str: + return (content_type or "").split(";", 1)[0].strip().lower() + + +def _is_risky_replay_document(fullpath: Path, content_type: str) -> bool: + if fullpath.suffix.lower() in RISKY_REPLAY_EXTENSIONS: + return True + + if _content_type_base(content_type) in RISKY_REPLAY_MIMETYPES: + return True + + # Unknown archived response paths often have no extension. Sniff a small prefix + # so one-domain no-JS mode still catches HTML/SVG documents. + try: + head = fullpath.read_bytes()[:4096].decode("utf-8", errors="ignore").lower() + except Exception: + return False + + return any(marker in head for marker in RISKY_REPLAY_MARKERS) + + +def _apply_archive_replay_headers(response: HttpResponse, *, fullpath: Path, content_type: str, is_archive_replay: bool) -> HttpResponse: + if not is_archive_replay: + return response + + response.headers.setdefault("X-Content-Type-Options", "nosniff") + response.headers.setdefault("X-ArchiveBox-Security-Mode", SERVER_CONFIG.SERVER_SECURITY_MODE) + + if SERVER_CONFIG.SHOULD_NEUTER_RISKY_REPLAY and _is_risky_replay_document(fullpath, content_type): + response.headers["Content-Security-Policy"] = ( + "sandbox; " + "default-src 'self' data: blob:; " + "script-src 'none'; " + "object-src 'none'; " + "base-uri 'none'; " + "form-action 'none'; " + "connect-src 'none'; " + "worker-src 'none'; " + "frame-ancestors 'self'; " + "style-src 'self' 'unsafe-inline' data: blob:; " + "img-src 'self' data: blob:; " + "media-src 'self' data: blob:; " + "font-src 'self' data: blob:;" + ) + response.headers.setdefault("Referrer-Policy", "no-referrer") + + return response + + +def serve_static_with_byterange_support(request, path, document_root=None, show_indexes=False, is_archive_replay: bool 
= False): + """ + Overrides Django's built-in django.views.static.serve function to support byte range requests. + This allows you to do things like seek into the middle of a huge mp4 or WACZ without downloading the whole file. + https://github.com/satchamo/django/commit/2ce75c5c4bee2a858c0214d136bfcd351fcde11d + """ + assert document_root + path = posixpath.normpath(path).lstrip("/") + fullpath = Path(safe_join(document_root, path)) + if os.access(fullpath, os.R_OK) and fullpath.is_dir(): + if request.GET.get("download") == "zip" and show_indexes: + return _build_directory_zip_response( + fullpath, + path, + is_archive_replay=is_archive_replay, + use_async_stream=hasattr(request, "scope"), + ) + if show_indexes: + response = _render_directory_index(request, path, fullpath) + return _apply_archive_replay_headers(response, fullpath=fullpath, content_type="text/html", is_archive_replay=is_archive_replay) + raise Http404(_("Directory indexes are not allowed here.")) + if not os.access(fullpath, os.R_OK): + raise Http404(_("“%(path)s” does not exist") % {"path": fullpath}) + + statobj = fullpath.stat() + document_root = Path(document_root) if document_root else None + rel_path = path + etag = None + if document_root: + file_hash = _hash_for_path(document_root, rel_path) + if file_hash: + etag = f'"{file_hash}"' + + if etag: + inm = request.META.get("HTTP_IF_NONE_MATCH") + if inm: + inm_list = [item.strip() for item in inm.split(",")] + if etag in inm_list or etag.strip('"') in [i.strip('"') for i in inm_list]: + not_modified = HttpResponseNotModified() + not_modified.headers["ETag"] = etag + not_modified.headers["Cache-Control"] = f"{_cache_policy()}, max-age=31536000, immutable" + not_modified.headers["Last-Modified"] = http_date(statobj.st_mtime) + return _apply_archive_replay_headers(not_modified, fullpath=fullpath, content_type="", is_archive_replay=is_archive_replay) + + content_type, encoding = mimetypes.guess_type(str(fullpath)) + content_type = content_type or 
"application/octet-stream" + # Add charset for text-like types (best guess), but don't override the type. + is_text_like = content_type.startswith("text/") or content_type in { + "application/json", + "application/javascript", + "application/xml", + "application/x-ndjson", + "image/svg+xml", + } + if is_text_like and "charset=" not in content_type: + content_type = f"{content_type}; charset=utf-8" + preview_as_text_html = ( + bool(request.GET.get("preview")) + and is_text_like + and not content_type.startswith("text/html") + and not content_type.startswith("image/svg+xml") + ) + preview_as_image_html = ( + bool(request.GET.get("preview")) and content_type.startswith("image/") and not content_type.startswith("image/svg+xml") + ) + + # Respect the If-Modified-Since header for non-markdown responses. + if not (content_type.startswith("text/plain") or content_type.startswith("text/html")): + if not static.was_modified_since(request.META.get("HTTP_IF_MODIFIED_SINCE"), statobj.st_mtime): + return _apply_archive_replay_headers( + HttpResponseNotModified(), + fullpath=fullpath, + content_type=content_type, + is_archive_replay=is_archive_replay, + ) + + # Wrap text-like outputs in HTML when explicitly requested for iframe previewing. 
+ if preview_as_text_html: + try: + max_preview_size = 10 * 1024 * 1024 + if statobj.st_size <= max_preview_size: + decoded = fullpath.read_text(encoding="utf-8", errors="replace") + wrapped = _render_text_preview_document(decoded, fullpath.name) + response = HttpResponse(wrapped, content_type="text/html; charset=utf-8") + response.headers["Last-Modified"] = http_date(statobj.st_mtime) + if etag: + response.headers["ETag"] = etag + response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=31536000, immutable" + else: + response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=60, stale-while-revalidate=300" + response.headers["Content-Disposition"] = f'inline; filename="{fullpath.name}"' + if encoding: + response.headers["Content-Encoding"] = encoding + return _apply_archive_replay_headers( + response, + fullpath=fullpath, + content_type="text/html; charset=utf-8", + is_archive_replay=is_archive_replay, + ) + except Exception: + pass + + if preview_as_image_html: + try: + preview_query = request.GET.copy() + preview_query.pop("preview", None) + raw_image_url = request.path + if preview_query: + raw_image_url = f"{raw_image_url}?{urlencode(list(preview_query.lists()), doseq=True)}" + wrapped = _render_image_preview_document(raw_image_url, fullpath.name) + response = HttpResponse(wrapped, content_type="text/html; charset=utf-8") + response.headers["Last-Modified"] = http_date(statobj.st_mtime) + if etag: + response.headers["ETag"] = etag + response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=31536000, immutable" + else: + response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=60, stale-while-revalidate=300" + response.headers["Content-Disposition"] = f'inline; filename="{fullpath.name}"' + if encoding: + response.headers["Content-Encoding"] = encoding + return _apply_archive_replay_headers( + response, + fullpath=fullpath, + content_type="text/html; charset=utf-8", + is_archive_replay=is_archive_replay, + ) + except Exception: + 
pass + + # Heuristic fix: some archived HTML outputs (e.g. mercury content.html) + # are stored with HTML-escaped markup or markdown sources. If so, render sensibly. + if content_type.startswith("text/plain") or content_type.startswith("text/html"): + try: + max_unescape_size = 10 * 1024 * 1024 # 10MB cap to avoid heavy memory use + if statobj.st_size <= max_unescape_size: + raw = fullpath.read_bytes() + decoded = raw.decode("utf-8", errors="replace") + escaped_count = decoded.count("<") + decoded.count(">") + tag_count = decoded.count("<") + if escaped_count and escaped_count > tag_count * 2: + decoded = html.unescape(decoded) + markdown_candidate = _extract_markdown_candidate(decoded) + if _looks_like_markdown(markdown_candidate): + wrapped = _render_markdown_document(markdown_candidate) + response = HttpResponse(wrapped, content_type="text/html; charset=utf-8") + response.headers["Last-Modified"] = http_date(statobj.st_mtime) + if etag: + response.headers["ETag"] = etag + response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=31536000, immutable" + else: + response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=60, stale-while-revalidate=300" + response.headers["Content-Disposition"] = f'inline; filename="{fullpath.name}"' + if encoding: + response.headers["Content-Encoding"] = encoding + return _apply_archive_replay_headers( + response, + fullpath=fullpath, + content_type="text/html; charset=utf-8", + is_archive_replay=is_archive_replay, + ) + if escaped_count and escaped_count > tag_count * 2: + response = HttpResponse(decoded, content_type=content_type) + response.headers["Last-Modified"] = http_date(statobj.st_mtime) + if etag: + response.headers["ETag"] = etag + response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=31536000, immutable" + else: + response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=60, stale-while-revalidate=300" + response.headers["Content-Disposition"] = f'inline; filename="{fullpath.name}"' + if 
encoding: + response.headers["Content-Encoding"] = encoding + return _apply_archive_replay_headers( + response, + fullpath=fullpath, + content_type=content_type, + is_archive_replay=is_archive_replay, + ) + except Exception: + pass + + # setup response object + ranged_file = RangedFileReader(open(fullpath, "rb")) + response = StreamingHttpResponse(ranged_file, content_type=content_type) + response.headers["Last-Modified"] = http_date(statobj.st_mtime) + if etag: + response.headers["ETag"] = etag + response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=31536000, immutable" + else: + response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=60, stale-while-revalidate=300" + if is_text_like: + response.headers["Content-Disposition"] = f'inline; filename="{fullpath.name}"' + if content_type.startswith("image/"): + response.headers["Cache-Control"] = "public, max-age=604800, immutable" + + # handle byte-range requests by serving chunk of file + if stat.S_ISREG(statobj.st_mode): + size = statobj.st_size + response["Content-Length"] = size + response["Accept-Ranges"] = "bytes" + response["X-Django-Ranges-Supported"] = "1" + # Respect the Range header. 
def serve_static(request, path, **kwargs):
    """
    Serve a static file located through the staticfiles finders.

    The requested ``path`` is normalized, resolved to an absolute file with
    ``finders.find()``, and then handed to
    ``serve_static_with_byterange_support()`` (directory of the file as
    ``document_root``, file name as the path), so byte-range requests work.

    To use, put a URL pattern such as::

        path('', serve_static)

    in your URLconf.

    Raises ``Http404`` when no finder can locate the file; requests for an
    empty path or a path ending in ``/`` get a "directory index" message.
    """

    lookup_path = posixpath.normpath(path).lstrip("/")
    located = finders.find(lookup_path)

    if located:
        # split into (directory, file name) so the range-aware view can serve it
        document_root, filename = os.path.split(located)
        return serve_static_with_byterange_support(request, filename, document_root=document_root, **kwargs)

    if path == "" or path.endswith("/"):
        raise Http404("Directory indexes are not allowed here.")
    raise Http404("'%s' could not be found" % path)
class RangedFileReader:
    """
    Iterate over part (or all) of a binary file-like object in blocks.

    Blocks of at most ``block_size`` bytes are yielded starting at byte
    offset ``start`` up to, but not including, byte offset ``stop``.
    ``start`` and ``stop`` are read when iteration actually begins, so they
    may be adjusted on the instance after construction (the byte-range view
    does this once it has parsed the Range header).

    The wrapper exposes ``close()`` so that whoever owns the reader (e.g. a
    streaming HTTP response) can release the underlying file when done.

    https://github.com/satchamo/django/commit/2ce75c5c4bee2a858c0214d136bfcd351fcde11d
    """

    block_size = 8192

    def __init__(self, file_like, start=0, stop=float("inf"), block_size=None):
        self.f = file_like
        self.block_size = block_size or RangedFileReader.block_size
        self.start = start
        self.stop = stop

    def __iter__(self):
        self.f.seek(self.start)
        position = self.start
        while position < self.stop:
            data = self.f.read(min(self.block_size, self.stop - position))
            if not data:
                break

            yield data
            # advance by what was actually read: read() may legally return
            # fewer bytes than requested, and assuming a full block would end
            # the loop early and truncate the range
            position += len(data)

    def close(self):
        """Close the wrapped file-like object if it supports closing."""
        closer = getattr(self.f, "close", None)
        if closer is not None:
            closer()
ellipsis for helptext that doesn't matter that much + console = Console() + prnt = lambda *args, **kwargs: console.print(*args, overflow="ellipsis", soft_wrap=True, **kwargs) + + # print the welcome message + prnt("[green]import re, os, sys, psutil, subprocess, requests, json, pydantic, benedict, django[/]") + prnt("[yellow4]# ArchiveBox Imports[/]") + prnt("[yellow4]import archivebox[/]") + prnt("[yellow4]from archivebox.cli import *[/]") + prnt() + + if console.width >= 80: + from archivebox.misc.logging import rainbow + + prnt(rainbow(archivebox.ASCII_LOGO)) + + prnt("[i] :heavy_dollar_sign: Welcome to the ArchiveBox Shell!") + prnt( + " [deep_sky_blue4]Docs:[/deep_sky_blue4] [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Shell-Usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Shell-Usage[/link]", + ) + prnt( + " [link=https://docs.archivebox.io/en/dev/apidocs/archivebox/archivebox.html]https://docs.archivebox.io/en/dev/apidocs/archivebox/archivebox.html[/link]", + ) + prnt() + prnt(" :grey_question: [violet]Hint[/] [i]Here are some examples to get started:[/]") + prnt( + " add[blink][deep_sky_blue4]?[/deep_sky_blue4][/blink] [grey53]# add ? 
def run(cmd, *args, input=None, capture_output=True, timeout=None, check=False, text=False, start_new_session=True, **kwargs):
    """Run a command like subprocess.run, then signal leftover processes in its group.

    Mostly copied from https://github.com/python/cpython/blob/master/Lib/subprocess.py

    Differences from ``subprocess.run`` visible here:
      * every element of ``cmd`` is converted with ``str()``;
      * a command whose first element ends in ``.py`` is run with ``sys.executable``;
      * ``capture_output`` defaults to True and ``start_new_session`` to True;
      * after the command finishes (or fails), SIGINT is sent to the child's
        process group to stop straggler subprocesses it forked.

    Raises:
        ValueError: if ``input`` is combined with ``stdin``, or
            ``capture_output`` with ``stdout``/``stderr``.
        TimeoutExpired: if ``timeout`` elapses (the child is killed first).
        CalledProcessError: if ``check`` is true and the exit code is non-zero.

    Returns:
        CompletedProcess with the args, return code, stdout and stderr.
    """

    cmd = [str(arg) for arg in cmd]

    if input is not None:
        if kwargs.get("stdin") is not None:
            raise ValueError("stdin and input arguments may not both be used.")
        kwargs["stdin"] = PIPE

    if capture_output:
        if ("stdout" in kwargs) or ("stderr" in kwargs):
            raise ValueError("stdout and stderr arguments may not be used with capture_output.")
        kwargs["stdout"] = PIPE
        kwargs["stderr"] = PIPE

    pgid = None
    try:
        if cmd and cmd[0].endswith(".py"):
            PYTHON_BINARY = sys.executable
            cmd = (PYTHON_BINARY, *cmd)

        with Popen(cmd, *args, start_new_session=start_new_session, text=text, **kwargs) as process:
            # process groups are a POSIX concept; os.getpgid is missing on Windows
            if hasattr(os, "getpgid"):
                pgid = os.getpgid(process.pid)
            try:
                stdout, stderr = process.communicate(input, timeout=timeout)
            except TimeoutExpired as exc:
                process.kill()
                if IS_WINDOWS:
                    # Windows accumulates the output in a single blocking
                    # read() call run on child threads, with the timeout
                    # being done in a join() on those threads.  communicate()
                    # _after_ kill() is required to collect that and add it
                    # to the exception.
                    timed_out_stdout, timed_out_stderr = process.communicate()
                    exc.stdout = timed_out_stdout.encode() if isinstance(timed_out_stdout, str) else timed_out_stdout
                    exc.stderr = timed_out_stderr.encode() if isinstance(timed_out_stderr, str) else timed_out_stderr
                else:
                    # POSIX _communicate already populated the output so
                    # far into the TimeoutExpired exception.
                    process.wait()
                raise
            except BaseException:  # Including KeyboardInterrupt, communicate handled that.
                process.kill()
                # We don't call process.wait() as .__exit__ does that for us.
                raise

            retcode = process.poll()
            if check and retcode:
                raise CalledProcessError(
                    retcode,
                    process.args,
                    output=stdout,
                    stderr=stderr,
                )
    finally:
        # force kill any straggler subprocesses that were forked from the main proc
        # (best-effort: the group is usually already gone, which raises and is ignored)
        try:
            # when start_new_session is False the child shares OUR process group,
            # so signalling that group would interrupt the calling process itself
            if pgid is not None and pgid != os.getpgrp():
                os.killpg(pgid, signal.SIGINT)
        except Exception:
            pass

    return CompletedProcess(process.args, retcode or 0, stdout, stderr)
@enforce_types
def chmod_file(path: str, cwd: str = "") -> None:
    """Apply the configured output permissions to a file or to a directory's contents.

    ``path`` is resolved relative to ``cwd`` (or the current working
    directory when ``cwd`` is empty). A plain file gets
    ``STORAGE_CONFIG.OUTPUT_PERMISSIONS``. For a directory, every entry
    below it is visited: sub-directories get
    ``STORAGE_CONFIG.DIR_OUTPUT_PERMISSIONS`` and everything else gets
    ``STORAGE_CONFIG.OUTPUT_PERMISSIONS``.

    Raises:
        Exception: if the resolved path is not readable.
    """

    root = Path(cwd or os.getcwd()) / path
    if not os.access(root, os.R_OK):
        raise Exception(f"Failed to chmod: {path} does not exist (did the previous step fail?)")

    file_mode = int(STORAGE_CONFIG.OUTPUT_PERMISSIONS, base=8)

    if not root.is_dir():
        # path is just a plain file
        os.chmod(root, file_mode)
        return

    dir_mode = int(STORAGE_CONFIG.DIR_OUTPUT_PERMISSIONS, base=8)
    # walk the resolved root, not the raw `path`: otherwise `cwd` is ignored
    # and a relative path is globbed against the process working directory
    for subpath in root.glob("**/*"):
        # directories need execute permissions to be able to list contents
        os.chmod(subpath, dir_mode if subpath.is_dir() else file_mode)
class suppress_output:
    """
    Context manager that silences stdout and/or stderr at the file-descriptor level.

    File descriptors 1 and 2 are pointed at ``os.devnull`` while the block
    runs, so output is suppressed even when it is written directly to the
    descriptors rather than through ``sys.stdout`` / ``sys.stderr``.
    On exit the original descriptors are restored and every descriptor
    opened by this object is closed, so an instance is single-use.

        with suppress_output():
            rogue_function()

    Args:
        stdout: redirect file descriptor 1 when true.
        stderr: redirect file descriptor 2 when true.
    """

    def __init__(self, stdout=True, stderr=True):
        # Open a pair of null files
        # Save the actual stdout (1) and stderr (2) file descriptors.
        self.stdout, self.stderr = stdout, stderr
        if stdout:
            self.null_stdout = os.open(os.devnull, os.O_RDWR)
            self.real_stdout = os.dup(1)
        if stderr:
            self.null_stderr = os.open(os.devnull, os.O_RDWR)
            self.real_stderr = os.dup(2)

    def __enter__(self):
        # Assign the null pointers to stdout and stderr.
        if self.stdout:
            os.dup2(self.null_stdout, 1)
        if self.stderr:
            os.dup2(self.null_stderr, 2)
        return self

    def __exit__(self, *_):
        # Re-assign the real stdout/stderr back to (1) and (2), then close
        # both helper descriptors: the saved duplicates are no longer needed
        # once restored, and leaving them open leaks one fd per stream per use.
        if self.stdout:
            os.dup2(self.real_stdout, 1)
            os.close(self.null_stdout)
            os.close(self.real_stdout)
        if self.stderr:
            os.dup2(self.real_stderr, 2)
            os.close(self.null_stderr)
            os.close(self.real_stderr)
class CustomTOMLEncoder(toml.encoder.TomlEncoder):
    """
    Custom TomlEncoder to work around https://github.com/uiri/toml's many encoding bugs.
    More info: https://github.com/fabiocaccamo/python-benedict/issues/439

    Registers dump functions so that filesystem paths are written as
    JSON-quoted strings, and routes ``str`` / ``re.RegexFlag`` values through
    ``better_toml_dump_str``.

        >>> toml.dumps(value, encoder=CustomTOMLEncoder())
    """

    def __init__(self, **kwargs):
        # imported locally so this class does not depend on extra module-level imports
        from pathlib import PurePath, PurePosixPath, PureWindowsPath, WindowsPath

        super().__init__(**kwargs)
        dump_funcs = cast(dict[Any, Callable[[Any], str]], self.dump_funcs)

        dump_path = lambda x: json.dumps(str(x))
        # the encoder picks a dump function by the value's exact type, so each
        # concrete path class needs its own entry (Path() yields PosixPath or
        # WindowsPath depending on the platform)
        for path_type in (Path, PosixPath, WindowsPath, PurePath, PurePosixPath, PureWindowsPath):
            dump_funcs[path_type] = dump_path

        dump_funcs[str] = better_toml_dump_str
        dump_funcs[re.RegexFlag] = better_toml_dump_str
def short_ts(ts: Any) -> str | None:
    """Return the whole-seconds part of a timestamp as a string, or None if it parses to None."""
    moment = parse_date(ts)
    if moment is None:
        return None

    # keep only what precedes the decimal point of the float timestamp
    whole_seconds, _, _ = str(moment.timestamp()).partition(".")
    return whole_seconds
def parens_are_matched(string: str, open_char="(", close_char=")"):
    """Return True when every open_char in string is closed by a later close_char, in order."""
    depth = 0
    for char in string:
        if char == open_char:
            depth += 1
            continue
        if char != close_char:
            continue

        depth -= 1
        if depth < 0:
            # a closer appeared before its opener
            return False

    return not depth
def split_comma_separated_urls(url: str):
    """
    Split a string of URLs glued together with commas.

    Yields ``(offset, piece)`` tuples, where ``offset`` is the index of the
    piece within the original string. A split only happens where a later
    ``http://`` or ``https://`` is directly preceded by a comma; if the next
    embedded scheme is not preceded by a comma, the rest is yielded unsplit.
    """
    remaining = url
    consumed = 0
    while True:
        candidates = [
            index
            for index in (remaining.find("http://", 1), remaining.find("https://", 1))
            if index != -1
        ]
        # search starts at 1, so a real hit is never below 1 and -1 is a safe sentinel
        boundary = min(candidates, default=-1)

        if boundary == -1 or remaining[boundary - 1] != ",":
            yield consumed, remaining
            return

        yield consumed, remaining[: boundary - 1]
        consumed += boundary
        remaining = remaining[boundary:]
def enforce_types(func):
    """
    Enforce function arg and kwarg types at runtime using its python3 type hints
    Simpler version of pydantic @validate_call decorator

    Only annotations that are plain classes (e.g. ``str``, ``int``, ``Path``)
    are checked with ``isinstance``; unannotated parameters and non-class
    annotations (unions, ``Any``, string annotations, ...) are skipped.

    Raises:
        TypeError: from the wrapped call when an argument is not an instance
            of its parameter's class annotation.
    """
    # TODO: check return type as well

    # the signature never changes, so inspect it once instead of on every call
    sig = signature(func)
    params = sig.parameters

    def check_argument_type(arg_key, arg_val):
        param = params.get(arg_key)
        annotation = None if param is None else param.annotation

        # an unannotated parameter reports `sig.empty`, which is itself a
        # class: without this explicit skip every value would fail isinstance
        if annotation is None or annotation is sig.empty:
            return
        if annotation.__class__ is not type:
            return

        if not isinstance(arg_val, annotation):
            raise TypeError(
                "{}(..., {}: {}) got unexpected {} argument {}={}".format(
                    func.__name__,
                    arg_key,
                    annotation.__name__,
                    type(arg_val).__name__,
                    arg_key,
                    str(arg_val)[:64],
                ),
            )

    def positional_names(count):
        """Return the parameter name that each of `count` positional args binds to."""
        names = []
        for param in params.values():
            if len(names) >= count:
                break
            if param.kind is param.VAR_POSITIONAL:
                # *args swallows every remaining positional argument
                names.extend([param.name] * (count - len(names)))
                break
            if param.kind in (param.POSITIONAL_ONLY, param.POSITIONAL_OR_KEYWORD):
                names.append(param.name)
            else:
                # keyword-only and **kwargs parameters never receive positionals
                break
        return names

    @wraps(func)
    def typechecked_function(*args, **kwargs):
        # check args
        for arg_key, arg_val in zip(positional_names(len(args)), args):
            check_argument_type(arg_key, arg_val)

        # check kwargs
        for arg_key, arg_val in kwargs.items():
            check_argument_type(arg_key, arg_val)

        return func(*args, **kwargs)

    return typechecked_function
{date}") + + +@enforce_types +def download_url(url: str, timeout: int | None = None) -> str: + """Download the contents of a remote url and return the text""" + + from archivebox.config.common import ARCHIVING_CONFIG + + timeout = timeout or ARCHIVING_CONFIG.TIMEOUT + session = requests.Session() + + if ARCHIVING_CONFIG.COOKIES_FILE and Path(ARCHIVING_CONFIG.COOKIES_FILE).is_file(): + cookie_jar = http.cookiejar.MozillaCookieJar(ARCHIVING_CONFIG.COOKIES_FILE) + cookie_jar.load(ignore_discard=True, ignore_expires=True) + for cookie in cookie_jar: + if cookie.value is not None: + session.cookies.set(cookie.name, cookie.value, domain=cookie.domain, path=cookie.path) + + response = session.get( + url, + headers={"User-Agent": ARCHIVING_CONFIG.USER_AGENT}, + verify=ARCHIVING_CONFIG.CHECK_SSL_VALIDITY, + timeout=timeout, + ) + + content_type = response.headers.get("Content-Type", "") + encoding = http_content_type_encoding(content_type) or html_body_declared_encoding(response.text) + + if encoding is not None: + response.encoding = encoding + + try: + return response.text + except UnicodeDecodeError: + # if response is non-test (e.g. 
image or other binary files), just return the filename instead + return url.rsplit("/", 1)[-1] + + +@enforce_types +def get_headers(url: str, timeout: int | None = None) -> str: + """Download the contents of a remote url and return the headers""" + # TODO: get rid of this and use an abx pluggy hook instead + + from archivebox.config.common import ARCHIVING_CONFIG + + timeout = timeout or ARCHIVING_CONFIG.TIMEOUT + + try: + response = requests.head( + url, + headers={"User-Agent": ARCHIVING_CONFIG.USER_AGENT}, + verify=ARCHIVING_CONFIG.CHECK_SSL_VALIDITY, + timeout=timeout, + allow_redirects=True, + ) + if response.status_code >= 400: + raise RequestException + except ReadTimeout: + raise + except RequestException: + response = requests.get( + url, + headers={"User-Agent": ARCHIVING_CONFIG.USER_AGENT}, + verify=ARCHIVING_CONFIG.CHECK_SSL_VALIDITY, + timeout=timeout, + stream=True, + ) + + return pyjson.dumps( + { + "URL": url, + "Status-Code": response.status_code, + "Elapsed": response.elapsed.total_seconds() * 1000, + "Encoding": str(response.encoding), + "Apparent-Encoding": response.apparent_encoding, + **dict(response.headers), + }, + indent=4, + ) + + +@enforce_types +def ansi_to_html(text: str) -> str: + """ + Based on: https://stackoverflow.com/questions/19212665/python-converting-ansi-color-codes-to-html + Simple way to render colored CLI stdout/stderr in HTML properly, Textual/rich is probably better though. + """ + + TEMPLATE = '
' + text = text.replace("[m", "
") + + def single_sub(match): + argsdict = match.groupdict() + if argsdict["arg_3"] is None: + if argsdict["arg_2"] is None: + _, color = 0, argsdict["arg_1"] + else: + _, color = argsdict["arg_1"], argsdict["arg_2"] + else: + _, color = argsdict["arg_3"], argsdict["arg_2"] + + return TEMPLATE.format(COLOR_DICT[color][0]) + + return COLOR_REGEX.sub(single_sub, text) + + +@enforce_types +def dedupe(options: list[str]) -> list[str]: + """ + Deduplicates the given CLI args by key=value. Options that come later override earlier. + """ + deduped = {} + + for option in options: + key = option.split("=")[0] + deduped[key] = option + + return list(deduped.values()) + + +class ExtendedEncoder(pyjson.JSONEncoder): + """ + Extended json serializer that supports serializing several model + fields and objects + """ + + def default(self, o): + cls_name = o.__class__.__name__ + + if hasattr(o, "_asdict"): + return o._asdict() + + elif isinstance(o, bytes): + return o.decode() + + elif isinstance(o, datetime): + return o.isoformat() + + elif isinstance(o, Exception): + return f"{o.__class__.__name__}: {o}" + + elif isinstance(o, Path): + return str(o) + + elif cls_name in ("dict_items", "dict_keys", "dict_values"): + return list(o) + + elif isinstance(o, Callable): + return str(o) + + # Try dict/list conversion as fallback + try: + return dict(o) + except Exception: + pass + + try: + return list(o) + except Exception: + pass + + try: + return str(o) + except Exception: + pass + + return pyjson.JSONEncoder.default(self, o) + + +@enforce_types +def to_json(obj: Any, indent: int | None = 4, sort_keys: bool = True) -> str: + """Serialize object to JSON string with extended type support""" + return pyjson.dumps(obj, indent=indent, sort_keys=sort_keys, cls=ExtendedEncoder) + + +### URL PARSING TESTS / ASSERTIONS + +# Check that plain text regex URL parsing works as expected +# this is last-line-of-defense to make sure the URL_REGEX isn't +# misbehaving due to some OS-level or 
environment level quirks (e.g. regex engine / cpython / locale differences) +# the consequences of bad URL parsing could be disastrous and lead to many +# incorrect/badly parsed links being added to the archive, so this is worth the cost of checking + +assert fix_url_from_markdown("http://example.com/a(b)c).x(y)z") == "http://example.com/a(b)c" +assert ( + fix_url_from_markdown("https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def).link(with)_trailingtext") + == "https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def" +) + +URL_REGEX_TESTS = [ + ("https://example.com", ["https://example.com"]), + ("https://sweeting.me,https://google.com", ["https://sweeting.me", "https://google.com"]), + ( + "http://abc-file234example.com/abc?def=abc&23423=sdfsdf#abc=234&234=a234", + ["http://abc-file234example.com/abc?def=abc&23423=sdfsdf#abc=234&234=a234"], + ), + ( + "https://twitter.com/share?url=https://akaao.success-corp.co.jp&text=ã‚ĸ@ã‚ĩ!ト&hashtags=ã‚ĸ%ã‚Ē,元+ã‚ĸ.ã‚ĸ-ã‚Ē_イ*ã‚ˇ$ロ abc", + [ + "https://twitter.com/share?url=https://akaao.success-corp.co.jp&text=ã‚ĸ@ã‚ĩ!ト&hashtags=ã‚ĸ%ã‚Ē,元+ã‚ĸ.ã‚ĸ-ã‚Ē_イ*ã‚ˇ$ロ", + "https://akaao.success-corp.co.jp&text=ã‚ĸ@ã‚ĩ!ト&hashtags=ã‚ĸ%ã‚Ē,元+ã‚ĸ.ã‚ĸ-ã‚Ē_イ*ã‚ˇ$ロ", + ], + ), + ( + ' abc', + [ + "https://twitter.com/share#url=https://akaao.success-corp.co.jp&text=ã‚ĸ@ã‚ĩ!ト?hashtags=ã‚ĸ%ã‚Ē,元+ã‚ĸ&abc=.ã‚ĸ-ã‚Ē_イ*ã‚ˇ$ロ", + "https://akaao.success-corp.co.jp&text=ã‚ĸ@ã‚ĩ!ト?hashtags=ã‚ĸ%ã‚Ē,元+ã‚ĸ&abc=.ã‚ĸ-ã‚Ē_イ*ã‚ˇ$ロ", + ], + ), + ("///a", []), + ("http://", []), + ("http://../", ["http://../"]), + ("http://-error-.invalid/", ["http://-error-.invalid/"]), + ("https://a(b)c+1#2?3&4/", ["https://a(b)c+1#2?3&4/"]), + ("http://⤉ā¤Ļā¤žā¤šā¤°ā¤Ŗ.ā¤Ē⤰āĨ€ā¤•āĨā¤ˇā¤ž", ["http://⤉ā¤Ļā¤žā¤šā¤°ā¤Ŗ.ā¤Ē⤰āĨ€ā¤•āĨā¤ˇā¤ž"]), + ("http://䞋子.æĩ‹č¯•", ["http://䞋子.æĩ‹č¯•"]), + ("http://➡.ws/䨚 htps://abc.1243?234", ["http://➡.ws/䨚"]), + ('http://⌘.ws">https://exa+mple.com//:abc ', ["http://⌘.ws", "https://exa+mple.com//:abc"]), + 
("http://Ų…ØĢØ§Ų„.ØĨØŽØĒØ¨Ø§Øą/abc?def=ØĒ&ب=abc#abc=234", ["http://Ų…ØĢØ§Ų„.ØĨØŽØĒØ¨Ø§Øą/abc?def=ØĒ&ب=abc#abc=234"]), + ("http://-.~_!$&()*+,;=:%40:80%2f::::::@example.c'om", ["http://-.~_!$&()*+,;=:%40:80%2f::::::@example.c"]), + ( + "http://us:pa@ex.co:42/http://ex.co:19/a?_d=4#-a=2.3", + ["http://us:pa@ex.co:42/http://ex.co:19/a?_d=4#-a=2.3", "http://ex.co:19/a?_d=4#-a=2.3"], + ), + ("http://code.google.com/events/#&product=browser", ["http://code.google.com/events/#&product=browser"]), + ("http://foo.bar?q=Spaces should be encoded", ["http://foo.bar?q=Spaces"]), + ("http://foo.com/blah_(wikipedia)#c(i)t[e]-1", ["http://foo.com/blah_(wikipedia)#c(i)t"]), + ("http://foo.com/(something)?after=parens", ["http://foo.com/(something)?after=parens"]), + ("http://foo.com/unicode_(âœĒ)_in_parens) abc", ["http://foo.com/unicode_(âœĒ)_in_parens"]), + ("http://foo.bar/?q=Test%20URL-encoded%20stuff", ["http://foo.bar/?q=Test%20URL-encoded%20stuff"]), + ("[xyz](http://a.b/?q=(Test)%20U)RL-encoded%20stuff", ["http://a.b/?q=(Test)%20U"]), + ("[xyz](http://a.b/?q=(Test)%20U)-ab https://abc+123", ["http://a.b/?q=(Test)%20U", "https://abc+123"]), + ("[xyz](http://a.b/?q=(Test)%20U) https://a(b)c+12)3", ["http://a.b/?q=(Test)%20U", "https://a(b)c+12"]), + ("[xyz](http://a.b/?q=(Test)a\nabchttps://a(b)c+12)3", ["http://a.b/?q=(Test)a", "https://a(b)c+12"]), + ("http://foo.bar/?q=Test%20URL-encoded%20stuff", ["http://foo.bar/?q=Test%20URL-encoded%20stuff"]), +] +for urls_str, expected_url_matches in URL_REGEX_TESTS: + url_matches = list(find_all_urls(urls_str)) + assert url_matches == expected_url_matches, "FAILED URL_REGEX CHECK!" 
def chrome_cleanup():
    """
    Remove the lock files Chrome leaves behind after being killed by a timeout
    or other error, so the next launch is not blocked by them.

    Handles:
    - All persona chrome_user_data directories (via Persona.cleanup_chrome_all())
    - Explicit CHROME_USER_DATA_DIR from config
    - Legacy Docker chromium path

    Best-effort: every failure is swallowed and nothing is returned.
    """
    import os
    from pathlib import Path
    from archivebox.config.permissions import IN_DOCKER

    def _discard_lock(lock_path):
        # lexists() also reports dangling symlinks, which exists() would miss
        if not os.path.lexists(lock_path):
            return
        try:
            os.remove(lock_path)
        except OSError:
            pass

    # Persona-managed chrome directories, then the configured custom one
    try:
        from archivebox.personas.models import Persona

        Persona.cleanup_chrome_all()

        # The active persona's CHROME_USER_DATA_DIR may be a custom path that
        # does not live under PERSONAS_DIR, so it is handled separately.
        from archivebox.config.configset import get_config

        user_data_dir = get_config().get("CHROME_USER_DATA_DIR")
        if user_data_dir:
            _discard_lock(Path(user_data_dir) / "SingletonLock")
    except Exception:
        pass  # Persona/config not available during early startup

    # Legacy Docker cleanup (for backwards compatibility)
    if IN_DOCKER:
        _discard_lock("/home/archivebox/.config/chromium/SingletonLock")
-For examples of supported import formats see tests/. -""" - -__package__ = 'archivebox.parsers' - -import re -from io import StringIO - -from typing import IO, Tuple, List, Optional -from datetime import datetime, timezone -from pathlib import Path - -from ..system import atomic_write -from ..config import ( - ANSI, - OUTPUT_DIR, - SOURCES_DIR_NAME, - TIMEOUT, - stderr, - hint, -) -from ..util import ( - basename, - htmldecode, - download_url, - enforce_types, - URL_REGEX, -) -from ..index.schema import Link -from ..logging_util import TimedProgress, log_source_saved - -from . import pocket_api -from . import wallabag_atom -from . import pocket_html -from . import pinboard_rss -from . import shaarli_rss -from . import medium_rss - -from . import netscape_html -from . import generic_rss -from . import generic_json -from . import generic_html -from . import generic_txt -from . import url_list - - -PARSERS = { - # Specialized parsers - pocket_api.KEY: (pocket_api.NAME, pocket_api.PARSER), - wallabag_atom.KEY: (wallabag_atom.NAME, wallabag_atom.PARSER), - pocket_html.KEY: (pocket_html.NAME, pocket_html.PARSER), - pinboard_rss.KEY: (pinboard_rss.NAME, pinboard_rss.PARSER), - shaarli_rss.KEY: (shaarli_rss.NAME, shaarli_rss.PARSER), - medium_rss.KEY: (medium_rss.NAME, medium_rss.PARSER), - - # General parsers - netscape_html.KEY: (netscape_html.NAME, netscape_html.PARSER), - generic_rss.KEY: (generic_rss.NAME, generic_rss.PARSER), - generic_json.KEY: (generic_json.NAME, generic_json.PARSER), - generic_html.KEY: (generic_html.NAME, generic_html.PARSER), - - # Catchall fallback parser - generic_txt.KEY: (generic_txt.NAME, generic_txt.PARSER), - - # Explicitly specified parsers - url_list.KEY: (url_list.NAME, url_list.PARSER), -} - - -@enforce_types -def parse_links_memory(urls: List[str], root_url: Optional[str]=None): - """ - parse a list of URLS without touching the filesystem - """ - - timer = TimedProgress(TIMEOUT * 4) - #urls = list(map(lambda x: x + "\n", urls)) - 
file = StringIO() - file.writelines(urls) - file.name = "io_string" - links, parser = run_parser_functions(file, timer, root_url=root_url) - timer.end() - - if parser is None: - return [], 'Failed to parse' - return links, parser - - -@enforce_types -def parse_links(source_file: str, root_url: Optional[str]=None, parser: str="auto") -> Tuple[List[Link], str]: - """parse a list of URLs with their metadata from an - RSS feed, bookmarks export, or text file - """ - - timer = TimedProgress(TIMEOUT * 4) - with open(source_file, 'r', encoding='utf-8') as file: - links, parser = run_parser_functions(file, timer, root_url=root_url, parser=parser) - - timer.end() - if parser is None: - return [], 'Failed to parse' - return links, parser - - -def run_parser_functions(to_parse: IO[str], timer, root_url: Optional[str]=None, parser: str="auto") -> Tuple[List[Link], Optional[str]]: - most_links: List[Link] = [] - best_parser_name = None - - if parser != "auto": - parser_name, parser_func = PARSERS[parser] - parsed_links = list(parser_func(to_parse, root_url=root_url)) - if not parsed_links: - stderr() - stderr(f'[X] No links found using {parser_name} parser', color='red') - hint('Try a different parser or double check the input?') - stderr() - timer.end() - return parsed_links, parser_name - - for parser_id in PARSERS: - parser_name, parser_func = PARSERS[parser_id] - try: - parsed_links = list(parser_func(to_parse, root_url=root_url)) - if not parsed_links: - raise Exception(f'No links found using {parser_name} parser') - - # print(f'[√] Parser {parser_name} succeeded: {len(parsed_links)} links parsed') - if len(parsed_links) > len(most_links): - most_links = parsed_links - best_parser_name = parser_name - - except Exception as err: # noqa - # Parsers are tried one by one down the list, and the first one - # that succeeds is used. To debug why a certain parser was not used - # due to python error or format incompatibility, uncomment this line: - - # print('[!] 
Parser {} failed: {} {}'.format(parser_name, err.__class__.__name__, err)) - # raise - pass - timer.end() - return most_links, best_parser_name - - -@enforce_types -def save_text_as_source(raw_text: str, filename: str='{ts}-stdin.txt', out_dir: Path=OUTPUT_DIR) -> str: - ts = str(datetime.now(timezone.utc).timestamp()).split('.', 1)[0] - source_path = str(out_dir / SOURCES_DIR_NAME / filename.format(ts=ts)) - atomic_write(source_path, raw_text) - log_source_saved(source_file=source_path) - return source_path - - -@enforce_types -def save_file_as_source(path: str, timeout: int=TIMEOUT, filename: str='{ts}-{basename}.txt', out_dir: Path=OUTPUT_DIR) -> str: - """download a given url's content into output/sources/domain-.txt""" - ts = str(datetime.now(timezone.utc).timestamp()).split('.', 1)[0] - source_path = str(OUTPUT_DIR / SOURCES_DIR_NAME / filename.format(basename=basename(path), ts=ts)) - - if any(path.startswith(s) for s in ('http://', 'https://', 'ftp://')): - # Source is a URL that needs to be downloaded - print(f' > Downloading {path} contents') - timer = TimedProgress(timeout, prefix=' ') - try: - raw_source_text = download_url(path, timeout=timeout) - raw_source_text = htmldecode(raw_source_text) - timer.end() - except Exception as e: - timer.end() - print('{}[!] Failed to download {}{}\n'.format( - ANSI['red'], - path, - ANSI['reset'], - )) - print(' ', e) - raise SystemExit(1) - - else: - # Source is a path to a local file on the filesystem - with open(path, 'r') as f: - raw_source_text = f.read() - - atomic_write(source_path, raw_source_text) - - log_source_saved(source_file=source_path) - - return source_path - - -# Check that plain text regex URL parsing works as expected -# this is last-line-of-defense to make sure the URL_REGEX isn't -# misbehaving due to some OS-level or environment level quirks (e.g. 
bad regex lib) -# the consequences of bad URL parsing could be disastrous and lead to many -# incorrect/badly parsed links being added to the archive, so this is worth the cost of checking -_test_url_strs = { - 'example.com': 0, - '/example.com': 0, - '//example.com': 0, - ':/example.com': 0, - '://example.com': 0, - 'htt://example8.com': 0, - '/htt://example.com': 0, - 'https://example': 1, - 'https://localhost/2345': 1, - 'https://localhost:1234/123': 1, - '://': 0, - 'https://': 0, - 'http://': 0, - 'ftp://': 0, - 'ftp://example.com': 0, - 'https://example.com': 1, - 'https://example.com/': 1, - 'https://a.example.com': 1, - 'https://a.example.com/': 1, - 'https://a.example.com/what/is/happening.html': 1, - 'https://a.example.com/what/ís/happening.html': 1, - 'https://a.example.com/what/is/happening.html?what=1&2%20b#hÃļw-about-this=1a': 1, - 'https://a.example.com/what/is/happÊning/?what=1&2%20b#how-aboÃŧt-this=1a': 1, - 'HTtpS://a.example.com/what/is/happening/?what=1&2%20b#how-about-this=1af&2f%20b': 1, - 'https://example.com/?what=1#how-about-this=1&2%20baf': 1, - 'https://example.com?what=1#how-about-this=1&2%20baf': 1, - 'http://example7.com': 1, - '[https://example8.com/what/is/this.php?what=1]': 1, - '[and http://example9.com?what=1&other=3#and-thing=2]': 1, - 'https://example10.com#and-thing=2 "': 1, - 'abcdef': 1, - 'sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi': 1, - 'http://examplehttp://15.badc': 2, - 'https://a.example.com/one.html?url=http://example.com/inside/of/another?=http://': 2, - '[https://a.example.com/one.html?url=http://example.com/inside/of/another?=](http://a.example.com)': 3, -} -for url_str, num_urls in _test_url_strs.items(): - assert len(re.findall(URL_REGEX, url_str)) == num_urls, ( - f'{url_str} does not contain {num_urls} urls') diff --git a/archivebox/parsers/generic_html.py b/archivebox/parsers/generic_html.py deleted file mode 100644 index 95adb01853..0000000000 --- 
a/archivebox/parsers/generic_html.py +++ /dev/null @@ -1,58 +0,0 @@ -__package__ = 'archivebox.parsers' - - -import re - -from typing import IO, Iterable, Optional -from datetime import datetime, timezone - -from ..index.schema import Link -from ..util import ( - htmldecode, - enforce_types, - URL_REGEX, -) -from html.parser import HTMLParser -from urllib.parse import urljoin - - -class HrefParser(HTMLParser): - def __init__(self): - super().__init__() - self.urls = [] - - def handle_starttag(self, tag, attrs): - if tag == "a": - for attr, value in attrs: - if attr == "href": - self.urls.append(value) - - -@enforce_types -def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None, **_kwargs) -> Iterable[Link]: - """Parse Generic HTML for href tags and use only the url (support for title coming later)""" - - html_file.seek(0) - for line in html_file: - parser = HrefParser() - # example line - #
  • example title
  • - parser.feed(line) - for url in parser.urls: - if root_url: - # resolve relative urls /home.html -> https://example.com/home.html - url = urljoin(root_url, url) - - for archivable_url in re.findall(URL_REGEX, url): - yield Link( - url=htmldecode(archivable_url), - timestamp=str(datetime.now(timezone.utc).timestamp()), - title=None, - tags=None, - sources=[html_file.name], - ) - - -KEY = 'html' -NAME = 'Generic HTML' -PARSER = parse_generic_html_export diff --git a/archivebox/parsers/generic_json.py b/archivebox/parsers/generic_json.py deleted file mode 100644 index 0466b0f6b7..0000000000 --- a/archivebox/parsers/generic_json.py +++ /dev/null @@ -1,70 +0,0 @@ -__package__ = 'archivebox.parsers' - -import json - -from typing import IO, Iterable -from datetime import datetime, timezone - -from ..index.schema import Link -from ..util import ( - htmldecode, - enforce_types, -) - - -@enforce_types -def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]: - """Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)""" - - json_file.seek(0) - links = json.load(json_file) - json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z') - - for link in links: - # example line - # {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}] - if link: - # Parse URL - url = link.get('href') or link.get('url') or link.get('URL') - if not url: - raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]') - - # Parse the timestamp - ts_str = str(datetime.now(timezone.utc).timestamp()) - if link.get('timestamp'): - # chrome/ff histories use a very precise timestamp - ts_str = str(link['timestamp'] / 10000000) - elif link.get('time'): - ts_str = str(json_date(link['time'].split(',', 
1)[0]).timestamp()) - elif link.get('created_at'): - ts_str = str(json_date(link['created_at']).timestamp()) - elif link.get('created'): - ts_str = str(json_date(link['created']).timestamp()) - elif link.get('date'): - ts_str = str(json_date(link['date']).timestamp()) - elif link.get('bookmarked'): - ts_str = str(json_date(link['bookmarked']).timestamp()) - elif link.get('saved'): - ts_str = str(json_date(link['saved']).timestamp()) - - # Parse the title - title = None - if link.get('title'): - title = link['title'].strip() - elif link.get('description'): - title = link['description'].replace(' — Readability', '').strip() - elif link.get('name'): - title = link['name'].strip() - - yield Link( - url=htmldecode(url), - timestamp=ts_str, - title=htmldecode(title) or None, - tags=htmldecode(link.get('tags')) or '', - sources=[json_file.name], - ) - - -KEY = 'json' -NAME = 'Generic JSON' -PARSER = parse_generic_json_export diff --git a/archivebox/parsers/generic_rss.py b/archivebox/parsers/generic_rss.py deleted file mode 100644 index 4bd0496734..0000000000 --- a/archivebox/parsers/generic_rss.py +++ /dev/null @@ -1,54 +0,0 @@ -__package__ = 'archivebox.parsers' - - -from typing import IO, Iterable -from datetime import datetime - -from ..index.schema import Link -from ..util import ( - htmldecode, - enforce_types, - str_between, -) - -@enforce_types -def parse_generic_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]: - """Parse RSS XML-format files into links""" - - rss_file.seek(0) - items = rss_file.read().split('') - items = items[1:] if items else [] - for item in items: - # example item: - # - # <![CDATA[How JavaScript works: inside the V8 engine]]> - # Unread - # https://blog.sessionstack.com/how-javascript-works-inside - # https://blog.sessionstack.com/how-javascript-works-inside - # Mon, 21 Aug 2017 14:21:58 -0500 - # - - trailing_removed = item.split('', 1)[0] - leading_removed = trailing_removed.split('', 1)[-1].strip() - rows = 
leading_removed.split('\n') - - def get_row(key): - return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0] - - url = str_between(get_row('link'), '', '') - ts_str = str_between(get_row('pubDate'), '', '') - time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z") - title = str_between(get_row('title'), ' Iterable[Link]: - """Parse links from a text file, ignoring other text""" - - text_file.seek(0) - for line in text_file.readlines(): - if not line.strip(): - continue - - # if the line is a local file path that resolves, then we can archive it - try: - if Path(line).exists(): - yield Link( - url=line, - timestamp=str(datetime.now(timezone.utc).timestamp()), - title=None, - tags=None, - sources=[text_file.name], - ) - except (OSError, PermissionError): - # nvm, not a valid path... - pass - - # otherwise look for anything that looks like a URL in the line - for url in re.findall(URL_REGEX, line): - yield Link( - url=htmldecode(url), - timestamp=str(datetime.now(timezone.utc).timestamp()), - title=None, - tags=None, - sources=[text_file.name], - ) - - # look inside the URL for any sub-urls, e.g. 
for archive.org links - # https://web.archive.org/web/20200531203453/https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/ - # -> https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/ - for sub_url in re.findall(URL_REGEX, line[1:]): - yield Link( - url=htmldecode(sub_url), - timestamp=str(datetime.now(timezone.utc).timestamp()), - title=None, - tags=None, - sources=[text_file.name], - ) - -KEY = 'txt' -NAME = 'Generic TXT' -PARSER = parse_generic_txt_export diff --git a/archivebox/parsers/medium_rss.py b/archivebox/parsers/medium_rss.py deleted file mode 100644 index a4159f286f..0000000000 --- a/archivebox/parsers/medium_rss.py +++ /dev/null @@ -1,40 +0,0 @@ -__package__ = 'archivebox.parsers' - - -from typing import IO, Iterable -from datetime import datetime - -from xml.etree import ElementTree - -from ..index.schema import Link -from ..util import ( - htmldecode, - enforce_types, -) - - -@enforce_types -def parse_medium_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]: - """Parse Medium RSS feed files into links""" - - rss_file.seek(0) - root = ElementTree.parse(rss_file).getroot() - items = root.find("channel").findall("item") # type: ignore - for item in items: - url = item.find("link").text # type: ignore - title = item.find("title").text.strip() # type: ignore - ts_str = item.find("pubDate").text # type: ignore - time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %Z") # type: ignore - - yield Link( - url=htmldecode(url), - timestamp=str(time.timestamp()), - title=htmldecode(title) or None, - tags=None, - sources=[rss_file.name], - ) - - -KEY = 'medium_rss' -NAME = 'Medium RSS' -PARSER = parse_medium_rss_export diff --git a/archivebox/parsers/netscape_html.py b/archivebox/parsers/netscape_html.py deleted file mode 100644 index 7523f100af..0000000000 --- a/archivebox/parsers/netscape_html.py +++ /dev/null @@ -1,43 +0,0 @@ -__package__ = 
'archivebox.parsers' - - -import re - -from typing import IO, Iterable -from datetime import datetime - -from ..index.schema import Link -from ..util import ( - htmldecode, - enforce_types, -) - - -@enforce_types -def parse_netscape_html_export(html_file: IO[str], **_kwargs) -> Iterable[Link]: - """Parse netscape-format bookmarks export files (produced by all browsers)""" - - html_file.seek(0) - pattern = re.compile("]*>(.+)", re.UNICODE | re.IGNORECASE) - for line in html_file: - # example line - #
    example bookmark title - - match = pattern.search(line) - if match: - url = match.group(1) - time = datetime.fromtimestamp(float(match.group(2))) - title = match.group(3).strip() - - yield Link( - url=htmldecode(url), - timestamp=str(time.timestamp()), - title=htmldecode(title) or None, - tags=None, - sources=[html_file.name], - ) - - -KEY = 'netscape_html' -NAME = 'Netscape HTML' -PARSER = parse_netscape_html_export diff --git a/archivebox/parsers/pinboard_rss.py b/archivebox/parsers/pinboard_rss.py deleted file mode 100644 index b7a77a00ee..0000000000 --- a/archivebox/parsers/pinboard_rss.py +++ /dev/null @@ -1,52 +0,0 @@ -__package__ = 'archivebox.parsers' - - -from typing import IO, Iterable -from datetime import datetime, timezone - -from xml.etree import ElementTree - -from ..index.schema import Link -from ..util import ( - htmldecode, - enforce_types, -) - - -@enforce_types -def parse_pinboard_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]: - """Parse Pinboard RSS feed files into links""" - - rss_file.seek(0) - root = ElementTree.parse(rss_file).getroot() - items = root.findall("{http://purl.org/rss/1.0/}item") - for item in items: - find = lambda p: item.find(p).text.strip() if item.find(p) else None # type: ignore - - url = find("{http://purl.org/rss/1.0/}link") - tags = find("{http://purl.org/dc/elements/1.1/}subject") - title = find("{http://purl.org/rss/1.0/}title") - ts_str = find("{http://purl.org/dc/elements/1.1/}date") - - # Pinboard includes a colon in its date stamp timezone offsets, which - # Python can't parse. 
Remove it: - if ts_str and ts_str[-3:-2] == ":": - ts_str = ts_str[:-3]+ts_str[-2:] - - if ts_str: - time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z") - else: - time = datetime.now(timezone.utc) - - yield Link( - url=htmldecode(url), - timestamp=str(time.timestamp()), - title=htmldecode(title) or None, - tags=htmldecode(tags) or None, - sources=[rss_file.name], - ) - - -KEY = 'pinboard_rss' -NAME = 'Pinboard RSS' -PARSER = parse_pinboard_rss_export diff --git a/archivebox/parsers/pocket_api.py b/archivebox/parsers/pocket_api.py deleted file mode 100644 index afad70ed90..0000000000 --- a/archivebox/parsers/pocket_api.py +++ /dev/null @@ -1,118 +0,0 @@ -__package__ = 'archivebox.parsers' - - -import re - -from typing import IO, Iterable, Optional -from configparser import ConfigParser - -from pathlib import Path -from ..vendor.pocket import Pocket - -from ..index.schema import Link -from ..util import enforce_types -from ..system import atomic_write -from ..config import ( - SOURCES_DIR, - POCKET_CONSUMER_KEY, - POCKET_ACCESS_TOKENS, -) - - -COUNT_PER_PAGE = 500 -API_DB_PATH = Path(SOURCES_DIR) / 'pocket_api.db' - -# search for broken protocols that sometimes come from the Pocket API -_BROKEN_PROTOCOL_RE = re.compile('^(http[s]?)(:/(?!/))') - - -def get_pocket_articles(api: Pocket, since=None, page=0): - body, headers = api.get( - state='archive', - sort='oldest', - since=since, - count=COUNT_PER_PAGE, - offset=page * COUNT_PER_PAGE, - ) - - articles = body['list'].values() if isinstance(body['list'], dict) else body['list'] - returned_count = len(articles) - - yield from articles - - if returned_count == COUNT_PER_PAGE: - yield from get_pocket_articles(api, since=since, page=page + 1) - else: - api.last_since = body['since'] - - -def link_from_article(article: dict, sources: list): - url: str = article['resolved_url'] or article['given_url'] - broken_protocol = _BROKEN_PROTOCOL_RE.match(url) - if broken_protocol: - url = 
url.replace(f'{broken_protocol.group(1)}:/', f'{broken_protocol.group(1)}://') - title = article['resolved_title'] or article['given_title'] or url - - return Link( - url=url, - timestamp=article['time_read'], - title=title, - tags=article.get('tags'), - sources=sources - ) - - -def write_since(username: str, since: str): - if not API_DB_PATH.exists(): - atomic_write(API_DB_PATH, '') - - since_file = ConfigParser() - since_file.optionxform = str - since_file.read(API_DB_PATH) - - since_file[username] = { - 'since': since - } - - with open(API_DB_PATH, 'w+') as new: - since_file.write(new) - - -def read_since(username: str) -> Optional[str]: - if not API_DB_PATH.exists(): - atomic_write(API_DB_PATH, '') - - config_file = ConfigParser() - config_file.optionxform = str - config_file.read(API_DB_PATH) - - return config_file.get(username, 'since', fallback=None) - - -@enforce_types -def should_parse_as_pocket_api(text: str) -> bool: - return text.startswith('pocket://') - - -@enforce_types -def parse_pocket_api_export(input_buffer: IO[str], **_kwargs) -> Iterable[Link]: - """Parse bookmarks from the Pocket API""" - - input_buffer.seek(0) - pattern = re.compile(r"^pocket:\/\/(\w+)") - for line in input_buffer: - if should_parse_as_pocket_api(line): - - username = pattern.search(line).group(1) - api = Pocket(POCKET_CONSUMER_KEY, POCKET_ACCESS_TOKENS[username]) - api.last_since = None - - for article in get_pocket_articles(api, since=read_since(username)): - yield link_from_article(article, sources=[line]) - - write_since(username, api.last_since) - - -KEY = 'pocket_api' -NAME = 'Pocket API' -PARSER = parse_pocket_api_export diff --git a/archivebox/parsers/pocket_html.py b/archivebox/parsers/pocket_html.py deleted file mode 100644 index d34c8bad77..0000000000 --- a/archivebox/parsers/pocket_html.py +++ /dev/null @@ -1,43 +0,0 @@ -__package__ = 'archivebox.parsers' - - -import re - -from typing import IO, Iterable -from datetime import datetime - -from ..index.schema import 
Link -from ..util import ( - htmldecode, - enforce_types, -) - - -@enforce_types -def parse_pocket_html_export(html_file: IO[str], **_kwargs) -> Iterable[Link]: - """Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)""" - - html_file.seek(0) - pattern = re.compile("^\\s*
  • (.+)
  • ", re.UNICODE) - for line in html_file: - # example line - #
  • example title
  • - match = pattern.search(line) - if match: - url = match.group(1).replace('http://www.readability.com/read?url=', '') # remove old readability prefixes to get original url - time = datetime.fromtimestamp(float(match.group(2))) - tags = match.group(3) - title = match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '') - - yield Link( - url=htmldecode(url), - timestamp=str(time.timestamp()), - title=htmldecode(title) or None, - tags=tags or '', - sources=[html_file.name], - ) - - -KEY = 'pocket_html' -NAME = 'Pocket HTML' -PARSER = parse_pocket_html_export diff --git a/archivebox/parsers/shaarli_rss.py b/archivebox/parsers/shaarli_rss.py deleted file mode 100644 index 6793489908..0000000000 --- a/archivebox/parsers/shaarli_rss.py +++ /dev/null @@ -1,55 +0,0 @@ -__package__ = 'archivebox.parsers' - - -from typing import IO, Iterable -from datetime import datetime - -from ..index.schema import Link -from ..util import ( - htmldecode, - enforce_types, - str_between, -) - - -@enforce_types -def parse_shaarli_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]: - """Parse Shaarli-specific RSS XML-format files into links""" - - rss_file.seek(0) - entries = rss_file.read().split('')[1:] - for entry in entries: - # example entry: - # - # Aktuelle Trojaner-Welle: Emotet lauert in gefälschten Rechnungsmails | heise online - # - # https://demo.shaarli.org/?cEV4vw - # 2019-01-30T06:06:01+00:00 - # 2019-01-30T06:06:01+00:00 - #

    Permalink

    ]]> - # - - trailing_removed = entry.split('', 1)[0] - leading_removed = trailing_removed.strip() - rows = leading_removed.split('\n') - - def get_row(key): - return [r.strip() for r in rows if r.strip().startswith('<{}'.format(key))][0] - - title = str_between(get_row('title'), '', '').strip() - url = str_between(get_row('link'), '') - ts_str = str_between(get_row('published'), '', '') - time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z") - - yield Link( - url=htmldecode(url), - timestamp=str(time.timestamp()), - title=htmldecode(title) or None, - tags=None, - sources=[rss_file.name], - ) - - -KEY = 'shaarli_rss' -NAME = 'Shaarli RSS' -PARSER = parse_shaarli_rss_export diff --git a/archivebox/parsers/url_list.py b/archivebox/parsers/url_list.py deleted file mode 100644 index e9a7bbb376..0000000000 --- a/archivebox/parsers/url_list.py +++ /dev/null @@ -1,37 +0,0 @@ -__package__ = 'archivebox.parsers' -__description__ = 'URL list' - -import re - -from typing import IO, Iterable -from datetime import datetime, timezone - -from ..index.schema import Link -from ..util import ( - enforce_types, - URL_REGEX, -) - - -@enforce_types -def parse_url_list(text_file: IO[str], **_kwargs) -> Iterable[Link]: - """Parse raw URLs from each line in a text file""" - - text_file.seek(0) - for line in text_file.readlines(): - url = line.strip() - if (not url) or not re.findall(URL_REGEX, url): - continue - - yield Link( - url=url, - timestamp=str(datetime.now(timezone.utc).timestamp()), - title=None, - tags=None, - sources=[text_file.name], - ) - - -KEY = 'url_list' -NAME = 'URL List' -PARSER = parse_url_list diff --git a/archivebox/parsers/wallabag_atom.py b/archivebox/parsers/wallabag_atom.py deleted file mode 100644 index 32740097ad..0000000000 --- a/archivebox/parsers/wallabag_atom.py +++ /dev/null @@ -1,62 +0,0 @@ -__package__ = 'archivebox.parsers' - - -from typing import IO, Iterable -from datetime import datetime - -from ..index.schema import Link -from ..util import ( 
- htmldecode, - enforce_types, - str_between, -) - - -@enforce_types -def parse_wallabag_atom_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]: - """Parse Wallabag Atom files into links""" - - rss_file.seek(0) - entries = rss_file.read().split('')[1:] - for entry in entries: - # example entry: - # - # <![CDATA[Orient Ray vs Mako: Is There Much Difference? - iknowwatches.com]]> - # - # https://iknowwatches.com/orient-ray-vs-mako/ - # wallabag:wallabag.drycat.fr:milosh:entry:14041 - # 2020-10-18T09:14:02+02:00 - # 2020-10-18T09:13:56+02:00 - # - # - # - - trailing_removed = entry.split('', 1)[0] - leading_removed = trailing_removed.strip() - rows = leading_removed.split('\n') - - def get_row(key): - return [r.strip() for r in rows if r.strip().startswith('<{}'.format(key))][0] - - title = str_between(get_row('title'), '<![CDATA[', ']]>').strip() - url = str_between(get_row('link rel="via"'), '', '') - ts_str = str_between(get_row('published'), '', '') - time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z") - try: - tags = str_between(get_row('category'), 'label="', '" />') - except Exception: - tags = None - - yield Link( - url=htmldecode(url), - timestamp=str(time.timestamp()), - title=htmldecode(title) or None, - tags=tags or '', - sources=[rss_file.name], - ) - - -KEY = 'wallabag_atom' -NAME = 'Wallabag Atom' -PARSER = parse_wallabag_atom_export diff --git a/archivebox/personas/__init__.py b/archivebox/personas/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/archivebox/personas/admin.py b/archivebox/personas/admin.py new file mode 100644 index 0000000000..cdf7df7fe4 --- /dev/null +++ b/archivebox/personas/admin.py @@ -0,0 +1,184 @@ +__package__ = "archivebox.personas" + +import shutil + +from django.contrib import admin, messages +from django.utils.html import format_html, format_html_join + +from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin +from archivebox.personas.forms import PersonaAdminForm +from 
archivebox.personas.importers import discover_local_browser_profiles +from archivebox.personas.models import Persona + + +class PersonaAdmin(ConfigEditorMixin, BaseModelAdmin): + form = PersonaAdminForm + change_form_template = "admin/personas/persona/change_form.html" + + list_display = ("name", "created_by", "created_at", "chrome_profile_state", "cookies_state", "auth_state") + search_fields = ("name", "created_by__username") + list_filter = ("created_by",) + ordering = ["name"] + list_per_page = 100 + readonly_fields = ("id", "created_at", "persona_paths", "import_artifact_status") + + add_fieldsets = ( + ( + "Persona", + { + "fields": ("name", "created_by"), + "classes": ("card",), + }, + ), + ( + "Browser Import", + { + "fields": ( + "import_mode", + "import_discovered_profile", + "import_source", + "import_profile_name", + "import_copy_profile", + "import_extract_cookies", + "import_capture_storage", + ), + "classes": ("card", "wide"), + }, + ), + ( + "Advanced", + { + "fields": ("config",), + "classes": ("card", "wide"), + }, + ), + ) + + change_fieldsets = add_fieldsets + ( + ( + "Artifacts", + { + "fields": ("persona_paths", "import_artifact_status"), + "classes": ("card", "wide"), + }, + ), + ( + "Timestamps", + { + "fields": ("id", "created_at"), + "classes": ("card",), + }, + ), + ) + + @admin.display(description="Chrome Profile") + def chrome_profile_state(self, obj: Persona) -> str: + return "yes" if (obj.path / "chrome_user_data").exists() else "no" + + @admin.display(description="cookies.txt") + def cookies_state(self, obj: Persona) -> str: + return "yes" if obj.COOKIES_FILE else "no" + + @admin.display(description="auth.json") + def auth_state(self, obj: Persona) -> str: + return "yes" if obj.AUTH_STORAGE_FILE else "no" + + @admin.display(description="Persona Paths") + def persona_paths(self, obj: Persona) -> str: + return format_html( + "
    " + "
    Persona root{}
    " + "
    chrome_user_data{}
    " + "
    chrome_extensions{}
    " + "
    chrome_downloads{}
    " + "
    cookies.txt{}
    " + "
    auth.json{}
    " + "
    ", + obj.path, + obj.CHROME_USER_DATA_DIR, + obj.CHROME_EXTENSIONS_DIR, + obj.CHROME_DOWNLOADS_DIR, + obj.COOKIES_FILE or (obj.path / "cookies.txt"), + obj.AUTH_STORAGE_FILE or (obj.path / "auth.json"), + ) + + @admin.display(description="Import Artifacts") + def import_artifact_status(self, obj: Persona) -> str: + entries = [ + ("Browser profile", (obj.path / "chrome_user_data").exists(), obj.CHROME_USER_DATA_DIR), + ("cookies.txt", bool(obj.COOKIES_FILE), obj.COOKIES_FILE or (obj.path / "cookies.txt")), + ("auth.json", bool(obj.AUTH_STORAGE_FILE), obj.AUTH_STORAGE_FILE or (obj.path / "auth.json")), + ] + return format_html( + "
    {}
    ", + format_html_join( + "", + "
    {}{}{}
    ", + ( + ( + label, + "abx-artifact-state abx-artifact-state--yes" if enabled else "abx-artifact-state abx-artifact-state--no", + "present" if enabled else "missing", + path, + ) + for label, enabled, path in entries + ), + ), + ) + + def get_fieldsets(self, request, obj=None): + return self.change_fieldsets if obj else self.add_fieldsets + + def render_change_form(self, request, context, add=False, change=False, form_url="", obj=None): + context["detected_profile_count"] = len(discover_local_browser_profiles()) + return super().render_change_form(request, context, add=add, change=change, form_url=form_url, obj=obj) + + def save_model(self, request, obj, form, change): + old_path = None + new_path = None + if change: + previous = Persona.objects.get(pk=obj.pk) + if previous.name != obj.name: + old_path = previous.path + new_path = obj.path + + super().save_model(request, obj, form, change) + + if old_path and new_path and old_path != new_path and old_path.exists(): + if new_path.exists(): + raise FileExistsError(f"Cannot rename Persona directory because the destination already exists: {new_path}") + shutil.move(str(old_path), str(new_path)) + + obj.ensure_dirs() + + import_result = form.apply_import(obj) + if import_result is None: + return + + completed_actions = [] + if import_result.profile_copied: + completed_actions.append("profile copied") + if import_result.cookies_imported: + completed_actions.append("cookies.txt generated") + if import_result.storage_captured: + completed_actions.append("auth.json captured") + if import_result.user_agent_imported: + completed_actions.append("USER_AGENT copied") + + if completed_actions: + messages.success( + request, + f"Imported {', '.join(completed_actions)} from {import_result.source.display_label}.", + ) + else: + messages.warning( + request, + f"Persona saved, but no browser artifacts were imported from {import_result.source.display_label}.", + ) + + for warning in import_result.warnings: + 
messages.warning(request, warning) + + +def register_admin(admin_site: admin.AdminSite) -> None: + admin_site.register(Persona, PersonaAdmin) diff --git a/archivebox/personas/apps.py b/archivebox/personas/apps.py new file mode 100644 index 0000000000..df45c2668a --- /dev/null +++ b/archivebox/personas/apps.py @@ -0,0 +1,7 @@ +from django.apps import AppConfig + + +class PersonasConfig(AppConfig): + default_auto_field = "django.db.models.BigAutoField" + name = "archivebox.personas" + label = "personas" diff --git a/archivebox/personas/export_browser_state.js b/archivebox/personas/export_browser_state.js new file mode 100644 index 0000000000..77b394f95d --- /dev/null +++ b/archivebox/personas/export_browser_state.js @@ -0,0 +1,210 @@ +#!/usr/bin/env node +/** + * Export cookies and open-tab storage from a Chromium profile or live CDP URL. + * + * Environment variables: + * ARCHIVEBOX_ABX_PLUGINS_DIR Absolute path to abx_plugins/plugins + * CHROME_USER_DATA_DIR Local Chromium user-data directory to launch + * CHROME_CDP_URL Existing browser CDP URL to attach to + * COOKIES_OUTPUT_FILE Optional output path for Netscape cookies.txt + * AUTH_STORAGE_OUTPUT_FILE Optional output path for auth.json + * CHROME_BINARY Optional browser binary override + * NODE_MODULES_DIR Optional node_modules path for puppeteer-core + */ + +const fs = require('fs'); +const os = require('os'); +const path = require('path'); + +const pluginsDir = process.env.ARCHIVEBOX_ABX_PLUGINS_DIR || process.env.ABX_PLUGINS_DIR; +if (!pluginsDir) { + console.error('ARCHIVEBOX_ABX_PLUGINS_DIR is required'); + process.exit(1); +} + +const baseUtils = require(path.join(pluginsDir, 'base', 'utils.js')); +baseUtils.ensureNodeModuleResolution(module); + +const chromeUtils = require(path.join(pluginsDir, 'chrome', 'chrome_utils.js')); +const puppeteer = require('puppeteer-core'); + +function cookieToNetscape(cookie) { + let domain = cookie.domain; + if (!domain.startsWith('.') && !cookie.hostOnly) { + domain = '.' 
+ domain; + } + + const includeSubdomains = domain.startsWith('.') ? 'TRUE' : 'FALSE'; + const cookiePath = cookie.path || '/'; + const secure = cookie.secure ? 'TRUE' : 'FALSE'; + const expiry = cookie.expires && cookie.expires > 0 ? Math.floor(cookie.expires).toString() : '0'; + + return `${domain}\t${includeSubdomains}\t${cookiePath}\t${secure}\t${expiry}\t${cookie.name}\t${cookie.value}`; +} + +function writeCookiesFile(cookies, outputPath) { + const lines = [ + '# Netscape HTTP Cookie File', + '# https://curl.se/docs/http-cookies.html', + '# This file was generated by ArchiveBox persona cookie extraction', + '#', + '# Format: domain\\tincludeSubdomains\\tpath\\tsecure\\texpiry\\tname\\tvalue', + '', + ]; + + for (const cookie of cookies) { + lines.push(cookieToNetscape(cookie)); + } + + fs.mkdirSync(path.dirname(outputPath), { recursive: true }); + fs.writeFileSync(outputPath, lines.join('\n') + '\n'); +} + +async function collectStorage(browser) { + const localStorage = {}; + const sessionStorage = {}; + const pages = await browser.pages(); + + for (const page of pages) { + try { + const url = page.url(); + if (!url || url === 'about:blank') continue; + if (url.startsWith('chrome:') || url.startsWith('edge:') || url.startsWith('devtools:')) continue; + + const payload = await page.evaluate(() => ({ + origin: window.location.origin, + localStorage: Object.fromEntries(Object.entries(window.localStorage)), + sessionStorage: Object.fromEntries(Object.entries(window.sessionStorage)), + })); + + if (!payload.origin || payload.origin === 'null') continue; + if (Object.keys(payload.localStorage || {}).length > 0) { + localStorage[payload.origin] = payload.localStorage; + } + if (Object.keys(payload.sessionStorage || {}).length > 0) { + sessionStorage[payload.origin] = payload.sessionStorage; + } + } catch (error) { + // Ignore pages that cannot be inspected via evaluate(). 
+ } + } + + return { localStorage, sessionStorage }; +} + +async function openBrowser() { + const cdpUrl = process.env.CHROME_CDP_URL || ''; + if (cdpUrl) { + const browser = await chromeUtils.connectToBrowserEndpoint(puppeteer, cdpUrl, { defaultViewport: null }); + return { + browser, + async cleanup() { + try { + await browser.disconnect(); + } catch (error) {} + }, + sourceDescription: cdpUrl, + }; + } + + const userDataDir = process.env.CHROME_USER_DATA_DIR; + if (!userDataDir) { + throw new Error('Either CHROME_USER_DATA_DIR or CHROME_CDP_URL is required'); + } + if (!fs.existsSync(userDataDir)) { + throw new Error(`User data directory does not exist: ${userDataDir}`); + } + + const outputDir = fs.mkdtempSync(path.join(os.tmpdir(), 'abx-browser-state-')); + const binary = process.env.CHROME_BINARY || chromeUtils.findAnyChromiumBinary(); + if (!binary) { + throw new Error('Could not find a Chromium binary for browser state export'); + } + + const launched = await chromeUtils.launchChromium({ + binary, + outputDir, + userDataDir, + headless: true, + killZombies: false, + }); + + if (!launched.success) { + throw new Error(launched.error || 'Chrome launch failed'); + } + + const browser = await chromeUtils.connectToBrowserEndpoint(puppeteer, launched.cdpUrl, { defaultViewport: null }); + + return { + browser, + async cleanup() { + try { + await browser.disconnect(); + } catch (error) {} + try { + await chromeUtils.killChrome(launched.pid, outputDir); + } catch (error) {} + try { + fs.rmSync(outputDir, { recursive: true, force: true }); + } catch (error) {} + }, + sourceDescription: userDataDir, + }; +} + +async function main() { + const cookiesOutput = process.env.COOKIES_OUTPUT_FILE || ''; + const authOutput = process.env.AUTH_STORAGE_OUTPUT_FILE || ''; + if (!cookiesOutput && !authOutput) { + throw new Error('COOKIES_OUTPUT_FILE or AUTH_STORAGE_OUTPUT_FILE is required'); + } + + const { browser, cleanup, sourceDescription } = await openBrowser(); + + try { + 
class PersonaAdminForm(forms.ModelForm):
    """Persona model form with optional "bootstrap from a browser" controls.

    Besides the model fields (``name``, ``created_by``, ``config``) the form
    exposes non-model ``import_*`` fields. ``clean()`` resolves them into a
    ``PersonaImportSource`` kept on ``self._resolved_import_source``;
    ``apply_import()`` then runs the import for an already-saved persona.
    """

    # How (and whether) to import browser state for this persona.
    import_mode = forms.ChoiceField(
        required=False,
        initial="none",
        label="Bootstrap this persona",
        widget=forms.RadioSelect,
        choices=(
            ("none", _mode_label("Blank Persona", "Create the persona without importing browser state yet.")),
            ("discovered", _mode_label("Use a detected profile", "Pick from Chromium profiles auto-discovered on this host.")),
            (
                "custom",
                _mode_label(
                    "Use a custom path or CDP URL",
                    "Paste an absolute Chromium path or attach to a live browser debugging endpoint.",
                ),
            ),
        ),
        help_text="These options run after the Persona row is saved, using the same backend import helpers as the CLI.",
    )
    # Choices are filled in per-instance by __init__ from the discovery scan.
    import_discovered_profile = forms.ChoiceField(
        required=False,
        label="Autodiscovered profiles",
        widget=forms.RadioSelect,
        choices=(),
        help_text="Detected from local Chrome, Chromium, Brave, and Edge profile roots.",
    )
    # Free-text source used when import_mode == "custom".
    import_source = forms.CharField(
        required=False,
        label="Absolute path or CDP URL",
        widget=forms.TextInput(
            attrs={
                "placeholder": "/Users/alice/Library/Application Support/Google/Chrome or ws://127.0.0.1:9222/devtools/browser/...",
                "style": "width: 100%; font-family: monospace;",
            },
        ),
        help_text="Accepts an absolute Chromium user-data dir, an exact profile dir, or a live HTTP/WS CDP endpoint.",
    )
    # Optional profile name passed to resolve_custom_import_source() as profile_dir.
    import_profile_name = forms.CharField(
        required=False,
        label="Profile directory name",
        widget=forms.TextInput(
            attrs={
                "placeholder": "Default or Profile 1",
                "style": "width: 100%; font-family: monospace;",
            },
        ),
        help_text="Only used when the custom path points at a browser root containing multiple profiles.",
    )
    # The three import actions; clean() requires at least one applicable action.
    import_copy_profile = forms.BooleanField(
        required=False,
        initial=True,
        label="Copy browser profile into this persona",
        help_text="Copies the chosen Chromium user-data tree into `chrome_user_data` for future archiving runs.",
    )
    import_extract_cookies = forms.BooleanField(
        required=False,
        initial=True,
        label="Generate `cookies.txt`",
        help_text="Extracts cookies through Chrome DevTools Protocol and writes a Netscape cookie jar for wget/curl-based plugins.",
    )
    import_capture_storage = forms.BooleanField(
        required=False,
        initial=True,
        label="Capture open-tab storage into `auth.json`",
        help_text="Snapshots currently open tab `localStorage` / `sessionStorage` values by origin. This is most useful for live CDP imports.",
    )

    class Meta:
        model = Persona
        fields = ("name", "created_by", "config")

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """Build the form and populate the discovered-profile choices.

        Runs ``discover_local_browser_profiles()`` on every instantiation, so
        the choice list reflects what is present when the form is built.
        """
        super().__init__(*args, **kwargs)
        self.discovered_profiles = discover_local_browser_profiles()
        # Set by clean(); stays None when no import was requested or resolved.
        self._resolved_import_source: PersonaImportSource | None = None

        self.fields["import_mode"].widget.attrs["class"] = "abx-import-mode"
        self.fields["import_discovered_profile"].widget.attrs["class"] = "abx-profile-picker"

        if self.discovered_profiles:
            self.fields["import_discovered_profile"].choices = [
                (profile.choice_value, profile.as_choice_label()) for profile in self.discovered_profiles
            ]
        else:
            # Nothing found: leave the picker empty and explain the alternative.
            self.fields["import_discovered_profile"].choices = []
            self.fields["import_discovered_profile"].help_text = (
                "No local Chromium profiles were detected on this host right now. "
                "Use the custom path/CDP option if the browser data lives elsewhere."
            )

    def clean_name(self) -> str:
        """Return the stripped persona name, or raise if ``validate_persona_name`` rejects it."""
        name = str(self.cleaned_data.get("name") or "").strip()
        is_valid, error_message = validate_persona_name(name)
        if not is_valid:
            raise forms.ValidationError(error_message)
        return name

    def clean(self) -> dict[str, Any]:
        """Validate the import options and resolve the chosen import source.

        Resets ``self._resolved_import_source`` and sets it again only when the
        selected mode yields a source. Field-level problems are reported with
        ``add_error``; a local-profile import with no action selected raises a
        form-level ``ValidationError``.
        """
        cleaned_data = super().clean()
        self._resolved_import_source = None

        # A missing or blank mode is treated the same as "none".
        import_mode = str(cleaned_data.get("import_mode") or "none").strip() or "none"
        if import_mode == "none":
            return cleaned_data

        if import_mode == "discovered":
            selection = str(cleaned_data.get("import_discovered_profile") or "").strip()
            if not selection:
                self.add_error("import_discovered_profile", "Choose one of the discovered profiles to import.")
                return cleaned_data
            try:
                self._resolved_import_source = PersonaImportSource.from_choice_value(selection)
            except ValueError as err:
                self.add_error("import_discovered_profile", str(err))
                return cleaned_data
        elif import_mode == "custom":
            raw_value = str(cleaned_data.get("import_source") or "").strip()
            if not raw_value:
                self.add_error("import_source", "Provide an absolute Chromium profile path or a CDP URL.")
                return cleaned_data
            try:
                self._resolved_import_source = resolve_custom_import_source(
                    raw_value,
                    profile_dir=str(cleaned_data.get("import_profile_name") or "").strip() or None,
                )
            except ValueError as err:
                self.add_error("import_source", str(err))
                return cleaned_data
        else:
            self.add_error("import_mode", "Choose how this Persona should be bootstrapped.")
            return cleaned_data

        copy_profile = bool(cleaned_data.get("import_copy_profile"))
        import_cookies = bool(cleaned_data.get("import_extract_cookies"))
        capture_storage = bool(cleaned_data.get("import_capture_storage"))

        # copy_profile does not count as an action for a CDP source.
        if self._resolved_import_source.kind == "cdp":
            if not (import_cookies or capture_storage):
                self.add_error(
                    "import_extract_cookies",
                    "CDP imports can only capture cookies and/or open-tab storage. Profile copying is not available for a remote browser endpoint.",
                )
        elif not (copy_profile or import_cookies or capture_storage):
            raise forms.ValidationError("Select at least one import action.")

        return cleaned_data

    def apply_import(self, persona: Persona) -> PersonaImportResult | None:
        """Run the import resolved by ``clean()`` against ``persona``.

        Returns ``None`` when no import source was resolved, otherwise the
        result of ``import_persona_from_source`` with the three action flags
        taken from ``cleaned_data``.
        """
        if not self._resolved_import_source:
            return None

        return import_persona_from_source(
            persona,
            self._resolved_import_source,
            copy_profile=bool(self.cleaned_data.get("import_copy_profile")),
            import_cookies=bool(self.cleaned_data.get("import_extract_cookies")),
            capture_storage=bool(self.cleaned_data.get("import_capture_storage")),
        )
@dataclass(frozen=True)
class PersonaImportSource:
    """Immutable description of where persona browser state is imported from.

    ``kind`` is ``"browser-profile"`` for a local Chromium profile (described
    by ``user_data_dir`` + ``profile_dir``) or ``"cdp"`` for a live browser
    reached through ``cdp_url``.
    """

    kind: str
    browser: str = "custom"
    source_name: str | None = None
    user_data_dir: Path | None = None
    profile_dir: str | None = None
    browser_binary: str | None = None
    cdp_url: str | None = None

    @property
    def browser_label(self) -> str:
        """Human-readable browser name; unknown keys fall back to title case."""
        return BROWSER_LABELS.get(self.browser, self.browser.title())

    @property
    def profile_path(self) -> Path | None:
        """Full path of the profile directory, or None if either part is unset."""
        if not self.user_data_dir or not self.profile_dir:
            return None
        return self.user_data_dir / self.profile_dir

    @property
    def display_label(self) -> str:
        """Short label: the CDP URL for CDP sources, else browser/source/profile."""
        if self.kind == "cdp":
            return self.cdp_url or "CDP URL"
        profile_suffix = f" / {self.profile_dir}" if self.profile_dir else ""
        source_prefix = f": {self.source_name}" if self.source_name else ""
        return f"{self.browser_label}{source_prefix}{profile_suffix}"

    @property
    def choice_value(self) -> str:
        """JSON encoding of this source, used as a form choice value.

        Unset fields are written as empty strings and keys are sorted so the
        same source always serializes to the same string.
        """
        return json.dumps(
            {
                "kind": self.kind,
                "browser": self.browser,
                "source_name": self.source_name or "",
                "user_data_dir": str(self.user_data_dir) if self.user_data_dir else "",
                "profile_dir": self.profile_dir or "",
                "browser_binary": self.browser_binary or "",
                "cdp_url": self.cdp_url or "",
            },
            sort_keys=True,
        )

    def as_choice_label(self) -> SafeString:
        """Escaped HTML label combining display label, binary note and path."""
        path_str = str(self.profile_path or self.user_data_dir or self.cdp_url or "")
        binary_suffix = f"Using {self.browser_binary}" if self.browser_binary else "Will auto-detect a Chromium binary"
        # NOTE(review): as visible here the template has no markup or separators
        # between the three values, so they render back to back - confirm this
        # matches the intended template.
        return format_html(
            '{}{}{}',
            self.display_label,
            binary_suffix,
            path_str,
        )

    @classmethod
    def from_choice_value(cls, value: str) -> PersonaImportSource:
        """Rebuild a browser-profile source from a ``choice_value`` string.

        The decoded fields are passed back through
        ``resolve_browser_profile_source`` rather than trusted as-is.

        Raises:
            ValueError: if ``value`` is not JSON, is not a JSON object, does
                not describe a ``"browser-profile"`` source, or is rejected by
                ``resolve_browser_profile_source``.
        """
        try:
            payload = json.loads(value)
        except (TypeError, ValueError) as err:
            # JSONDecodeError is a ValueError; TypeError covers non-string input.
            raise ValueError("Invalid discovered profile selection.") from err

        # Valid JSON that is not an object ("[]", "null", "1") has no .get();
        # reject it with the ValueError callers already handle.
        if not isinstance(payload, dict) or payload.get("kind") != "browser-profile":
            raise ValueError("Invalid discovered profile selection.")

        user_data_dir = Path(str(payload.get("user_data_dir") or "")).expanduser()
        profile_dir = str(payload.get("profile_dir") or "").strip()
        browser = str(payload.get("browser") or "custom").strip().lower() or "custom"
        source_name = str(payload.get("source_name") or "").strip() or None
        browser_binary = str(payload.get("browser_binary") or "").strip() or None

        return resolve_browser_profile_source(
            browser=browser,
            source_name=source_name,
            user_data_dir=user_data_dir,
            profile_dir=profile_dir,
            browser_binary=browser_binary,
        )
def get_brave_user_data_dir() -> Path | None:
    """Return the default Brave user-data directory for this platform.

    The platform's candidate location is returned only if it exists and
    ``_list_profile_names`` finds at least one profile in it; otherwise
    (including on unrecognized platforms) the result is ``None``.
    """
    os_name = platform.system()
    home_dir = Path.home()

    roots: list[Path] = []
    if os_name == "Darwin":
        roots.append(home_dir / "Library" / "Application Support" / "BraveSoftware" / "Brave-Browser")
    elif os_name == "Linux":
        roots.append(home_dir / ".config" / "BraveSoftware" / "Brave-Browser")
    elif os_name == "Windows":
        app_data = Path(os.environ.get("LOCALAPPDATA", home_dir / "AppData" / "Local"))
        roots.append(app_data / "BraveSoftware" / "Brave-Browser" / "User Data")

    return next((root for root in roots if root.exists() and _list_profile_names(root)), None)
def get_browser_binary(browser: str) -> str | None:
    """Return the first known install path of ``browser`` that exists on disk.

    ``browser`` is matched case-insensitively against a per-platform table of
    well-known executable locations. Unknown browsers, unrecognized platforms
    and browsers with no existing candidate all yield ``None``.
    """
    os_name = platform.system()
    home_dir = Path.home()
    wanted = browser.lower()

    if os_name == "Darwin":
        known_paths = {
            "chrome": ["/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"],
            "chromium": ["/Applications/Chromium.app/Contents/MacOS/Chromium"],
            "brave": ["/Applications/Brave Browser.app/Contents/MacOS/Brave Browser"],
            "edge": ["/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge"],
        }
    elif os_name == "Linux":
        known_paths = {
            "chrome": [
                "/usr/bin/google-chrome",
                "/usr/bin/google-chrome-stable",
                "/usr/bin/google-chrome-beta",
                "/usr/bin/google-chrome-unstable",
            ],
            "chromium": ["/usr/bin/chromium", "/usr/bin/chromium-browser"],
            "brave": ["/usr/bin/brave-browser", "/usr/bin/brave-browser-beta", "/usr/bin/brave-browser-nightly"],
            "edge": [
                "/usr/bin/microsoft-edge",
                "/usr/bin/microsoft-edge-stable",
                "/usr/bin/microsoft-edge-beta",
                "/usr/bin/microsoft-edge-dev",
            ],
        }
    elif os_name == "Windows":
        app_data = Path(os.environ.get("LOCALAPPDATA", home_dir / "AppData" / "Local"))
        known_paths = {
            "chrome": [
                str(app_data / "Google" / "Chrome" / "Application" / "chrome.exe"),
                "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
                "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe",
            ],
            "chromium": [str(app_data / "Chromium" / "Application" / "chrome.exe")],
            "brave": [
                str(app_data / "BraveSoftware" / "Brave-Browser" / "Application" / "brave.exe"),
                "C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe",
                "C:\\Program Files (x86)\\BraveSoftware\\Brave-Browser\\Application\\brave.exe",
            ],
            "edge": [
                str(app_data / "Microsoft" / "Edge" / "Application" / "msedge.exe"),
                "C:\\Program Files\\Microsoft\\Edge\\Application\\msedge.exe",
                "C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe",
            ],
        }
    else:
        known_paths = {}

    # Candidates are listed in preference order; the first existing one wins.
    return next(
        (binary for binary in known_paths.get(wanted, []) if binary and Path(binary).exists()),
        None,
    )
def discover_local_browser_profiles() -> list[PersonaImportSource]:
    """Collect every importable browser profile found on this host.

    Walks each finder in ``BROWSER_PROFILE_FINDERS``, resolves every profile
    listed under the user-data dir it returns, and silently skips profiles
    that ``resolve_browser_profile_source`` rejects with ``ValueError``.
    Profiles from ``discover_persona_template_profiles()`` are appended last.
    """
    sources: list[PersonaImportSource] = []

    for browser_key, locate_root in BROWSER_PROFILE_FINDERS.items():
        root = locate_root()
        if root:
            binary = get_browser_binary(browser_key)
            for profile_name in _list_profile_names(root):
                try:
                    source = resolve_browser_profile_source(
                        browser=browser_key,
                        user_data_dir=root,
                        profile_dir=profile_name,
                        browser_binary=binary,
                    )
                except ValueError:
                    continue
                sources.append(source)

    sources.extend(discover_persona_template_profiles())
    return sources
def resolve_browser_profile_source(
    browser: str,
    user_data_dir: Path,
    profile_dir: str,
    source_name: str | None = None,
    browser_binary: str | None = None,
) -> PersonaImportSource:
    """Validate a local profile location and wrap it in a ``PersonaImportSource``.

    Args:
        browser: Browser key stored on the resulting source.
        user_data_dir: Browser user-data root; ``~`` is expanded and a relative
            path is resolved to an absolute one.
        profile_dir: Name of the profile directory directly inside the root.
        source_name: Optional label stored on the resulting source.
        browser_binary: Optional browser executable path stored on the source.

    Raises:
        ValueError: if the root does not exist, ``profile_dir`` is blank or is
            not a plain directory name, or the resulting directory does not
            pass ``_looks_like_profile_dir``.
    """
    resolved_root = user_data_dir.expanduser()
    if not resolved_root.is_absolute():
        resolved_root = resolved_root.resolve()
    if not resolved_root.exists():
        raise ValueError(f"Profile root does not exist: {resolved_root}")
    if not profile_dir.strip():
        raise ValueError("Profile directory name cannot be empty.")
    # profile_dir can come from free-text input. Joining an absolute path, a
    # ".." segment or anything with a separator onto the root would point
    # outside it, so only a single directory name is accepted.
    if profile_dir in {".", ".."} or any(char in profile_dir for char in ("/", "\\", "\x00")):
        raise ValueError(f"Profile directory name must be a single directory name, not a path: {profile_dir}")

    profile_path = resolved_root / profile_dir
    if not _looks_like_profile_dir(profile_path):
        raise ValueError(f"Profile directory does not look valid: {profile_path}")

    return PersonaImportSource(
        kind="browser-profile",
        browser=browser,
        source_name=source_name,
        user_data_dir=resolved_root,
        profile_dir=profile_dir,
        browser_binary=browser_binary,
    )
def pick_default_profile_dir(user_data_dir: Path) -> str | None:
    """Choose which profile under ``user_data_dir`` to use when none was named.

    Prefers ``"Default"`` when present, otherwise the first name returned by
    ``_list_profile_names``; returns ``None`` when there are no profiles.
    """
    names = _list_profile_names(user_data_dir)
    if "Default" in names:
        return "Default"
    return names[0] if names else None
def export_browser_state(
    *,
    user_data_dir: Path | None = None,
    cdp_url: str | None = None,
    profile_dir: str | None = None,
    chrome_binary: str | None = None,
    cookies_output_file: Path | None = None,
    auth_output_file: Path | None = None,
) -> tuple[bool, dict | None, str]:
    """Run ``export_browser_state.js`` under Node to dump cookies/storage.

    The browser is selected by ``user_data_dir`` (local profile) and/or
    ``cdp_url`` (live browser); options are handed to the script through
    environment variables. If an output file already exists, the script
    writes to a scratch file and the result is merged into the existing file
    instead of replacing it. An auth payload is always captured (to a scratch
    file when ``auth_output_file`` is not given) so it can be returned.

    Args:
        user_data_dir: Chromium user-data root to launch against.
        cdp_url: CDP endpoint of an already-running browser.
        profile_dir: Profile name, appended to ``CHROME_ARGS_EXTRA`` as
            ``--profile-directory=...``.
        chrome_binary: Browser executable, exported as ``CHROME_BINARY``.
        cookies_output_file: Destination for the Netscape cookie jar.
        auth_output_file: Destination for the auth JSON.

    Returns:
        ``(success, auth_payload, message)``. On failure ``auth_payload`` is
        ``None`` and ``message`` describes the error; on success ``message``
        is the script's stderr (or stdout) output.
    """
    if not user_data_dir and not cdp_url:
        return False, None, "Missing browser source."

    from abx_plugins import get_plugins_dir
    from archivebox.config.common import STORAGE_CONFIG

    state_script = Path(__file__).with_name("export_browser_state.js")
    if not state_script.exists():
        return False, None, f"Browser state export script not found at {state_script}"

    node_modules_dir = STORAGE_CONFIG.LIB_DIR / "npm" / "node_modules"
    chrome_plugin_dir = Path(get_plugins_dir()).resolve()

    env = os.environ.copy()
    env["NODE_MODULES_DIR"] = str(node_modules_dir)
    env["ARCHIVEBOX_ABX_PLUGINS_DIR"] = str(chrome_plugin_dir)

    if user_data_dir:
        env["CHROME_USER_DATA_DIR"] = str(user_data_dir)
    if cdp_url:
        env["CHROME_CDP_URL"] = cdp_url
        env["CHROME_IS_LOCAL"] = "false"
    if chrome_binary:
        env["CHROME_BINARY"] = str(chrome_binary)
    if profile_dir:
        # Keep any inherited extra args (JSON list or comma-separated string)
        # and append the profile selector to them.
        existing_extra = env.get("CHROME_ARGS_EXTRA", "").strip()
        args_list: list[str] = []
        if existing_extra:
            comma_separated = [part.strip() for part in existing_extra.split(",") if part.strip()]
            if existing_extra.startswith("["):
                try:
                    parsed = json.loads(existing_extra)
                except ValueError:
                    # Looked like JSON but is not; fall back to comma splitting.
                    args_list.extend(comma_separated)
                else:
                    if isinstance(parsed, list):
                        args_list.extend(str(item) for item in parsed)
            else:
                args_list.extend(comma_separated)
        args_list.append(f"--profile-directory={profile_dir}")
        env["CHROME_ARGS_EXTRA"] = json.dumps(args_list)

    temp_dir: Path | None = None
    tmp_cookies_file: Path | None = None
    tmp_auth_file: Path | None = None

    def scratch_path(filename: str) -> Path:
        """Return ``filename`` inside the scratch dir, creating the dir on first use."""
        nonlocal temp_dir
        if temp_dir is None:
            temp_dir = Path(tempfile.mkdtemp(prefix="ab_browser_state_"))
        return temp_dir / filename

    # Everything after this point may create the scratch dir, so the finally
    # clause removes it on every exit path, including early error returns.
    try:
        if cookies_output_file and cookies_output_file.exists():
            tmp_cookies_file = scratch_path("cookies.txt")
            env["COOKIES_OUTPUT_FILE"] = str(tmp_cookies_file)
        elif cookies_output_file:
            env["COOKIES_OUTPUT_FILE"] = str(cookies_output_file)

        if auth_output_file and not auth_output_file.exists():
            env["AUTH_STORAGE_OUTPUT_FILE"] = str(auth_output_file)
        else:
            # Either the target exists (merge afterwards) or no target was
            # requested (payload is only read back and returned).
            tmp_auth_file = scratch_path("auth.json")
            env["AUTH_STORAGE_OUTPUT_FILE"] = str(tmp_auth_file)

        try:
            result = subprocess.run(
                ["node", str(state_script)],
                env=env,
                capture_output=True,
                text=True,
                timeout=120,
            )
        except subprocess.TimeoutExpired:
            return False, None, "Browser state export timed out."
        except FileNotFoundError:
            return False, None, "Node.js was not found, so ArchiveBox could not extract browser state."
        except Exception as err:
            return False, None, f"Browser state export failed: {err}"

        if result.returncode != 0:
            message = (result.stderr or result.stdout or "").strip() or "Browser state export failed."
            return False, None, message

        auth_payload: dict | None = None
        if cookies_output_file and tmp_cookies_file and tmp_cookies_file.exists():
            _merge_netscape_cookies(cookies_output_file, tmp_cookies_file)
        if auth_output_file and tmp_auth_file and tmp_auth_file.exists():
            _merge_auth_storage(auth_output_file, tmp_auth_file)
            auth_payload = _load_auth_storage(tmp_auth_file)
        elif auth_output_file and auth_output_file.exists():
            auth_payload = _load_auth_storage(auth_output_file)
        elif tmp_auth_file and tmp_auth_file.exists():
            auth_payload = _load_auth_storage(tmp_auth_file)

        return True, auth_payload, (result.stderr or result.stdout or "").strip()
    finally:
        if temp_dir is not None:
            shutil.rmtree(temp_dir, ignore_errors=True)
child.name.startswith("Guest Profile"): + if _looks_like_profile_dir(child): + profiles.append(child.name) + continue + if _looks_like_profile_dir(child): + profiles.append(child.name) + return profiles + + +def _looks_like_profile_dir(path: Path) -> bool: + if not path.exists() or not path.is_dir(): + return False + + marker_paths = ( + path / "Preferences", + path / "History", + path / "Cookies", + path / "Network" / "Cookies", + path / "Local Storage", + path / "Session Storage", + ) + + if any(marker.exists() for marker in marker_paths): + return True + + return any(path.name == prefix or path.name.startswith(prefix) for prefix in BROWSER_PROFILE_DIR_NAMES) + + +def _looks_like_cdp_url(value: str) -> bool: + parsed = urlparse(value) + return parsed.scheme in {"ws", "wss", "http", "https"} and bool(parsed.netloc) + + +def _parse_netscape_cookies(path: Path) -> dict[tuple[str, str, str], tuple[str, str, str, str, str, str, str]]: + cookies: dict[tuple[str, str, str], tuple[str, str, str, str, str, str, str]] = {} + if not path.exists(): + return cookies + + for line in path.read_text().splitlines(): + if not line or line.startswith("#"): + continue + parts = line.split("\t") + if len(parts) < 7: + continue + domain, include_subdomains, cookie_path, secure, expiry, name, value = parts[:7] + cookies[(domain, cookie_path, name)] = (domain, include_subdomains, cookie_path, secure, expiry, name, value) + return cookies + + +def _write_netscape_cookies( + path: Path, + cookies: dict[tuple[str, str, str], tuple[str, str, str, str, str, str, str]], +) -> None: + lines = list(NETSCAPE_COOKIE_HEADER) + for cookie in cookies.values(): + lines.append("\t".join(cookie)) + path.write_text("\n".join(lines) + "\n") + + +def _merge_netscape_cookies(existing_file: Path, new_file: Path) -> None: + existing = _parse_netscape_cookies(existing_file) + new = _parse_netscape_cookies(new_file) + existing.update(new) + _write_netscape_cookies(existing_file, existing) + + +def 
def _empty_auth_storage() -> dict:
    """Return a fresh, empty auth payload (a new dict each call, since callers mutate it)."""
    return {
        "TYPE": "auth",
        "cookies": [],
        "localStorage": {},
        "sessionStorage": {},
    }


def _merge_auth_storage(existing_file: Path, new_file: Path) -> None:
    """Merge the auth payload in *new_file* into *existing_file* and rewrite *existing_file*.

    * ``localStorage`` / ``sessionStorage`` are merged per origin; an origin
      present in both files takes the payload from *new_file*.
    * ``cookies`` are merged with :func:`_merge_cookie_dicts`.
    * ``user_agent`` prefers the new value, then the existing one, then ``""``.
    * Any other top-level key is taken from *new_file* when present there,
      otherwise from *existing_file*.

    A storage section that is missing or ``null`` in either file is treated
    as empty.
    """
    existing_payload = _load_auth_storage(existing_file)
    new_payload = _load_auth_storage(new_file)

    # `or {}` (rather than setdefault) so an explicit JSON null is handled
    # like a missing section instead of failing on item assignment.
    merged_local = {
        **(existing_payload.get("localStorage") or {}),
        **(new_payload.get("localStorage") or {}),
    }
    merged_session = {
        **(existing_payload.get("sessionStorage") or {}),
        **(new_payload.get("sessionStorage") or {}),
    }
    cookies = _merge_cookie_dicts(existing_payload.get("cookies") or [], new_payload.get("cookies") or [])

    merged = {
        **existing_payload,
        **new_payload,
        "cookies": cookies,
        "localStorage": merged_local,
        "sessionStorage": merged_session,
        "user_agent": new_payload.get("user_agent") or existing_payload.get("user_agent") or "",
    }
    existing_file.write_text(json.dumps(merged, indent=2, sort_keys=True) + "\n")


def _load_auth_storage(path: Path) -> dict:
    """Load the auth payload stored as JSON at *path*.

    Returns an empty payload (``TYPE``, ``cookies``, ``localStorage``,
    ``sessionStorage``) when the file is missing, cannot be decoded as text,
    is not valid JSON, or does not contain a JSON object. Other I/O errors
    propagate.
    """
    try:
        payload = json.loads(path.read_text())
    except (FileNotFoundError, json.JSONDecodeError, UnicodeDecodeError):
        return _empty_auth_storage()
    if not isinstance(payload, dict):
        return _empty_auth_storage()
    return payload


def _merge_cookie_dicts(existing: list[dict], new: list[dict]) -> list[dict]:
    """Merge two cookie lists, de-duplicating on ``(domain, path, name)``.

    A missing or empty ``path`` counts as ``"/"``; a missing ``domain`` or
    ``name`` counts as ``""``. When a cookie appears more than once the last
    occurrence wins (so *new* overrides *existing*), but it keeps the position
    of its first occurrence in the result.
    """
    merged: dict[tuple[str, str, str], dict] = {}
    for cookie in (*existing, *new):
        key = (
            str(cookie.get("domain") or ""),
            str(cookie.get("path") or "/"),
            str(cookie.get("name") or ""),
        )
        merged[key] = cookie
    return list(merged.values())
def _apply_imported_user_agent(persona: Persona, auth_payload: dict | None) -> bool:
    """Copy the user agent from an imported auth payload into the persona config.

    Returns True only when ``persona.config["USER_AGENT"]`` was changed and
    saved. Returns False when there is no payload, when the payload carries
    no non-blank ``user_agent``, or when the persona already uses that value.
    """
    imported_agent = str((auth_payload or {}).get("user_agent") or "").strip()
    if not imported_agent:
        return False

    current_config = dict(persona.config or {})
    if current_config.get("USER_AGENT") == imported_agent:
        return False

    # Assign a new dict rather than mutating persona.config in place.
    persona.config = {**current_config, "USER_AGENT": imported_agent}
    persona.save(update_fields=["config"])
    return True
# `fcntl` may not be importable everywhere (the ImportError branch below;
# presumably non-Unix platforms such as Windows — TODO confirm). In that case
# the module-level name `fcntl` ends up as None, and
# Persona.lock_runtime_for_crawl skips its flock() calls.
_fcntl: Any | None = None
try:
    import fcntl as _fcntl_import
except ImportError:  # pragma: no cover
    pass
else:
    _fcntl = _fcntl_import

# Static type checkers see the real module; at runtime `fcntl` is either the
# imported module or None.
if TYPE_CHECKING:
    import fcntl
else:
    fcntl = _fcntl


# Directory names that Persona.cleanup_chrome_profile removes (with their
# contents) wherever they occur inside a profile directory.
VOLATILE_PROFILE_DIR_NAMES = {
    "Cache",
    "Code Cache",
    "GPUCache",
    "ShaderCache",
    "Service Worker",
    "GCM Store",
    "Crashpad",
    "BrowserMetrics",
}

# File names that Persona.cleanup_chrome_profile unlinks wherever they occur
# inside a profile directory.
VOLATILE_PROFILE_FILE_NAMES = {
    "BrowserMetrics-spare.pma",
    "SingletonCookie",
    "SingletonLock",
    "SingletonSocket",
}
def __str__(self) -> str:
    """Use the persona's name as its string form."""
    return self.name

@property
def path(self) -> Path:
    """Directory of this persona: ``CONSTANTS.PERSONAS_DIR / <name>``."""
    # Imported at call time rather than at module import.
    from archivebox.config.constants import CONSTANTS

    personas_root = CONSTANTS.PERSONAS_DIR
    return personas_root / self.name

@property
def CHROME_USER_DATA_DIR(self) -> str:
    """String path of the ``chrome_user_data`` directory under :attr:`path`."""
    user_data_dir = self.path / "chrome_user_data"
    return str(user_data_dir)

@property
def CHROME_EXTENSIONS_DIR(self) -> str:
    """String path of the ``chrome_extensions`` directory under :attr:`path`."""
    extensions_dir = self.path / "chrome_extensions"
    return str(extensions_dir)

@property
def CHROME_DOWNLOADS_DIR(self) -> str:
    """String path of the ``chrome_downloads`` directory under :attr:`path`."""
    downloads_dir = self.path / "chrome_downloads"
    return str(downloads_dir)

@property
def COOKIES_FILE(self) -> str:
    """String path of ``cookies.txt`` under :attr:`path`, or ``""`` if it does not exist."""
    cookies_txt = self.path / "cookies.txt"
    if not cookies_txt.exists():
        return ""
    return str(cookies_txt)
@property + def AUTH_STORAGE_FILE(self) -> str: + """Derived path to auth.json for this persona (if it exists).""" + auth_path = self.path / "auth.json" + return str(auth_path) if auth_path.exists() else "" + + def get_derived_config(self) -> dict: + """ + Get config dict with derived paths filled in. + + Returns dict with: + - All values from self.config JSONField + - CHROME_USER_DATA_DIR (derived from persona path) + - CHROME_EXTENSIONS_DIR (derived from persona path) + - CHROME_DOWNLOADS_DIR (derived from persona path) + - COOKIES_FILE (derived from persona path, if file exists) + - AUTH_STORAGE_FILE (derived from persona path, if file exists) + - ACTIVE_PERSONA (set to this persona's name) + """ + derived = dict(self.config or {}) + + # Add derived paths (don't override if explicitly set in config) + if "CHROME_USER_DATA_DIR" not in derived: + derived["CHROME_USER_DATA_DIR"] = self.CHROME_USER_DATA_DIR + if "CHROME_EXTENSIONS_DIR" not in derived: + derived["CHROME_EXTENSIONS_DIR"] = self.CHROME_EXTENSIONS_DIR + if "CHROME_DOWNLOADS_DIR" not in derived: + derived["CHROME_DOWNLOADS_DIR"] = self.CHROME_DOWNLOADS_DIR + if "COOKIES_FILE" not in derived and self.COOKIES_FILE: + derived["COOKIES_FILE"] = self.COOKIES_FILE + if "AUTH_STORAGE_FILE" not in derived and self.AUTH_STORAGE_FILE: + derived["AUTH_STORAGE_FILE"] = self.AUTH_STORAGE_FILE + + # Always set ACTIVE_PERSONA to this persona's name + derived["ACTIVE_PERSONA"] = self.name + + return derived + + def ensure_dirs(self) -> None: + """Create persona directories if they don't exist.""" + self.path.mkdir(parents=True, exist_ok=True) + (self.path / "chrome_user_data").mkdir(parents=True, exist_ok=True) + (self.path / "chrome_extensions").mkdir(parents=True, exist_ok=True) + (self.path / "chrome_downloads").mkdir(parents=True, exist_ok=True) + + def cleanup_chrome_profile(self, profile_dir: Path) -> bool: + """Remove volatile Chrome state that should never be reused across launches.""" + cleaned = False + + if 
not profile_dir.exists(): + return False + + for path in profile_dir.rglob("*"): + if path.name in VOLATILE_PROFILE_FILE_NAMES: + try: + path.unlink() + cleaned = True + except OSError: + pass + + for dirname in VOLATILE_PROFILE_DIR_NAMES: + for path in profile_dir.rglob(dirname): + if not path.is_dir(): + continue + shutil.rmtree(path, ignore_errors=True) + cleaned = True + + for path in profile_dir.rglob("*.log"): + try: + path.unlink() + cleaned = True + except OSError: + pass + + return cleaned + + def cleanup_chrome(self) -> bool: + """Clean up volatile Chrome state for this persona's base profile.""" + return self.cleanup_chrome_profile(self.path / "chrome_user_data") + + @contextmanager + def lock_runtime_for_crawl(self): + lock_path = self.path / ".archivebox-crawl-profile.lock" + lock_path.parent.mkdir(parents=True, exist_ok=True) + + with lock_path.open("w") as lock_file: + if fcntl is not None: + fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX) + try: + yield + finally: + if fcntl is not None: + fcntl.flock(lock_file.fileno(), fcntl.LOCK_UN) + + def runtime_root_for_crawl(self, crawl) -> Path: + return Path(crawl.output_dir) / ".persona" / self.name + + def runtime_profile_dir_for_crawl(self, crawl) -> Path: + return self.runtime_root_for_crawl(crawl) / "chrome_user_data" + + def runtime_downloads_dir_for_crawl(self, crawl) -> Path: + return self.runtime_root_for_crawl(crawl) / "chrome_downloads" + + def copy_chrome_profile(self, source_dir: Path, destination_dir: Path) -> None: + destination_dir.parent.mkdir(parents=True, exist_ok=True) + shutil.rmtree(destination_dir, ignore_errors=True) + destination_dir.mkdir(parents=True, exist_ok=True) + + copy_cmd: list[str] | None = None + source_contents = f"{source_dir}/." 
+ + if sys.platform == "darwin": + copy_cmd = ["cp", "-cR", source_contents, str(destination_dir)] + elif sys.platform.startswith("linux"): + copy_cmd = ["cp", "-a", source_contents, str(destination_dir)] + + if copy_cmd: + result = subprocess.run(copy_cmd, capture_output=True, text=True) + if result.returncode == 0: + return + + shutil.rmtree(destination_dir, ignore_errors=True) + destination_dir.mkdir(parents=True, exist_ok=True) + + shutil.copytree(source_dir, destination_dir, symlinks=True, dirs_exist_ok=True) + + def prepare_runtime_for_crawl(self, crawl, chrome_binary: str = "") -> dict[str, str]: + self.ensure_dirs() + + template_dir = Path(self.CHROME_USER_DATA_DIR) + runtime_root = self.runtime_root_for_crawl(crawl) + runtime_profile_dir = self.runtime_profile_dir_for_crawl(crawl) + runtime_downloads_dir = self.runtime_downloads_dir_for_crawl(crawl) + + with self.lock_runtime_for_crawl(): + if not runtime_profile_dir.exists(): + if template_dir.exists() and any(template_dir.iterdir()): + self.copy_chrome_profile(template_dir, runtime_profile_dir) + else: + runtime_profile_dir.mkdir(parents=True, exist_ok=True) + + runtime_downloads_dir.mkdir(parents=True, exist_ok=True) + self.cleanup_chrome_profile(runtime_profile_dir) + + (runtime_root / "persona_name.txt").write_text(self.name) + (runtime_root / "template_dir.txt").write_text(str(template_dir)) + if chrome_binary: + (runtime_root / "chrome_binary.txt").write_text(chrome_binary) + + return { + "CHROME_USER_DATA_DIR": str(runtime_profile_dir), + "CHROME_DOWNLOADS_DIR": str(runtime_downloads_dir), + } + + def cleanup_runtime_for_crawl(self, crawl) -> None: + shutil.rmtree(Path(crawl.output_dir) / ".persona", ignore_errors=True) + + @classmethod + def get_or_create_default(cls) -> "Persona": + """Get or create the Default persona.""" + persona, _ = cls.objects.get_or_create(name="Default") + return persona + + @classmethod + def cleanup_chrome_all(cls) -> int: + """Clean up Chrome state files for all 
personas.""" + cleaned = 0 + for persona in cls.objects.all(): + if persona.cleanup_chrome(): + cleaned += 1 + return cleaned diff --git a/archivebox/personas/views.py b/archivebox/personas/views.py new file mode 100644 index 0000000000..60f00ef0ef --- /dev/null +++ b/archivebox/personas/views.py @@ -0,0 +1 @@ +# Create your views here. diff --git a/archivebox/search/__init__.py b/archivebox/search/__init__.py index 6191ede911..525c4dbe75 100644 --- a/archivebox/search/__init__.py +++ b/archivebox/search/__init__.py @@ -1,108 +1,197 @@ -from typing import List, Union -from pathlib import Path -from importlib import import_module +""" +Search module for ArchiveBox. -from django.db.models import QuerySet +Search indexing is handled by search backend hooks in plugins: + abx_plugins/plugins/search_backend_*/on_Snapshot__*_index_*.py -from archivebox.index.schema import Link -from archivebox.util import enforce_types -from archivebox.config import stderr, OUTPUT_DIR, USE_INDEXING_BACKEND, USE_SEARCHING_BACKEND, SEARCH_BACKEND_ENGINE +This module provides the query interface that dynamically discovers +search backend plugins using the hooks system. 
-from .utils import get_indexable_content, log_index_started +Search backends must provide a search.py module with: + - search(query: str) -> List[str] (returns snapshot IDs) + - flush(snapshot_ids: Iterable[str]) -> None +""" -def indexing_enabled(): - return USE_INDEXING_BACKEND +__package__ = "archivebox.search" -def search_backend_enabled(): - return USE_SEARCHING_BACKEND +from typing import Any -def get_backend(): - return f'search.backends.{SEARCH_BACKEND_ENGINE}' +from django.db.models import Case, IntegerField, QuerySet, Value, When -def import_backend(): - backend_string = get_backend() - try: - backend = import_module(backend_string) - except Exception as err: - raise Exception("Could not load '%s' as a backend: %s" % (backend_string, err)) - return backend +from archivebox.misc.util import enforce_types +from archivebox.misc.logging import stderr +from archivebox.config.common import SEARCH_BACKEND_CONFIG -@enforce_types -def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir: Path=OUTPUT_DIR, skip_text_index: bool=False) -> None: - if not indexing_enabled(): - return - if not skip_text_index and texts: - from core.models import Snapshot +# Cache discovered backends to avoid repeated filesystem scans +_search_backends_cache: dict | None = None +SEARCH_MODES = ("meta", "contents", "deep") - snap = Snapshot.objects.filter(url=link.url).first() - backend = import_backend() - if snap: - try: - backend.index(snapshot_id=str(snap.id), texts=texts) - except Exception as err: - stderr() - stderr( - f'[X] The search backend threw an exception={err}:', - color='red', - ) -@enforce_types -def query_search_index(query: str, out_dir: Path=OUTPUT_DIR) -> QuerySet: - from core.models import Snapshot - - if search_backend_enabled(): - backend = import_backend() - try: - snapshot_ids = backend.search(query) - except Exception as err: - stderr() - stderr( - f'[X] The search backend threw an exception={err}:', - color='red', - ) - raise - else: - # 
def get_default_search_mode() -> str:
    """Return "meta" when the configured engine is ripgrep, otherwise "contents"."""
    return "meta" if SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE == "ripgrep" else "contents"


def get_search_mode(search_mode: str | None) -> str:
    """
    Normalize a user-supplied search mode.

    The value is stripped and lower-cased; anything that is not one of
    SEARCH_MODES (including None and "") falls back to get_default_search_mode().
    """
    normalized = (search_mode or "").strip().lower()
    return normalized if normalized in SEARCH_MODES else get_default_search_mode()


def prioritize_metadata_matches(
    base_queryset: QuerySet,
    metadata_queryset: QuerySet,
    fulltext_queryset: QuerySet,
    *,
    deep_queryset: QuerySet | None = None,
    ordering: list[str] | tuple[str, ...] | None = None,
) -> QuerySet:
    """
    Merge metadata, full-text and (optionally) deep matches into one ranked queryset.

    Each primary key is assigned to the first tier it appears in and the result
    is ``base_queryset`` filtered to those keys, annotated with ``search_rank``:
    0 for metadata matches, 1 for full-text matches, 2 for everything else
    (i.e. deep matches).

    Args:
        base_queryset: queryset the final result is built from.
        metadata_queryset: matches from the default (metadata) search.
        fulltext_queryset: matches from the search backend.
        deep_queryset: optional extra matches, ranked last.
        ordering: when not None, results are ordered by ``search_rank`` followed
            by these fields (pass ``()`` to order by rank only); when None no
            ordering is applied here.

    Returns:
        A distinct queryset, or ``base_queryset.none()`` when nothing matched.
    """
    metadata_ids = list(metadata_queryset.values_list("pk", flat=True).distinct())
    metadata_id_set = set(metadata_ids)
    # later tiers only keep ids that no earlier tier has already claimed
    fulltext_ids = [pk for pk in fulltext_queryset.values_list("pk", flat=True).distinct() if pk not in metadata_id_set]
    fulltext_id_set = set(fulltext_ids)
    deep_ids = []
    if deep_queryset is not None:
        deep_ids = [
            pk for pk in deep_queryset.values_list("pk", flat=True).distinct() if pk not in metadata_id_set and pk not in fulltext_id_set
        ]

    if not metadata_ids and not fulltext_ids and not deep_ids:
        return base_queryset.none()

    qs = base_queryset.filter(pk__in=[*metadata_ids, *fulltext_ids, *deep_ids]).annotate(
        search_rank=Case(
            When(pk__in=metadata_ids, then=Value(0)),
            When(pk__in=fulltext_ids, then=Value(1)),
            default=Value(2),
            output_field=IntegerField(),
        ),
    )

    if ordering is not None:
        qs = qs.order_by("search_rank", *ordering)

    return qs.distinct()


def get_available_backends() -> dict:
    """
    Discover all available search backend plugins.

    Uses the hooks system to find plugins with search.py modules.
    Results are cached after first call.
    """
    global _search_backends_cache

    if _search_backends_cache is None:
        # imported lazily, at first use
        from archivebox.hooks import get_search_backends

        _search_backends_cache = get_search_backends()

    return _search_backends_cache


def get_backend() -> Any:
    """
    Get the configured search backend module.

    Discovers available backends via the hooks system and returns
    the one matching SEARCH_BACKEND_ENGINE configuration.

    Falls back to 'ripgrep' if configured backend is not found.

    Raises:
        RuntimeError: if neither the configured backend nor ripgrep is available.
    """
    backend_name = SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE
    backends = get_available_backends()

    if backend_name in backends:
        return backends[backend_name]

    # Fallback to ripgrep if available (no index needed)
    if "ripgrep" in backends:
        return backends["ripgrep"]

    # No backends found
    available = list(backends.keys())
    raise RuntimeError(
        f'Search backend "{backend_name}" not found. Available backends: {available or "none"}',
    )


@enforce_types
def query_search_index(query: str, search_mode: str | None = None) -> QuerySet:
    """
    Search for snapshots matching the query.

    Returns a QuerySet of Snapshot objects matching the search.

    Behaviour by mode (``None`` is treated as "contents"; other values go
    through get_search_mode()):
    - "meta": the backend is not consulted, an empty queryset is returned.
    - "contents": only the configured backend is queried (ripgrep if the
      configured one is unavailable); any backend error is logged and re-raised.
    - "deep": every discovered backend is queried, ripgrep first. Individual
      failures are collected; the first one is raised only if no backend
      succeeded at all.

    An empty queryset is also returned when USE_SEARCHING_BACKEND is off.
    """
    from archivebox.core.models import Snapshot

    if not SEARCH_BACKEND_CONFIG.USE_SEARCHING_BACKEND:
        return Snapshot.objects.none()

    search_mode = "contents" if search_mode is None else get_search_mode(search_mode)
    if search_mode == "meta":
        return Snapshot.objects.none()

    backends = get_available_backends()
    backend_names: list[str] = []
    configured_backend = SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE
    if search_mode == "deep":
        if "ripgrep" in backends:
            backend_names.append("ripgrep")
        backend_names.extend(name for name in backends if name != "ripgrep")
    elif configured_backend in backends:
        backend_names.append(configured_backend)
    elif "ripgrep" in backends:
        backend_names.append("ripgrep")
    else:
        # called for its side effect: raises RuntimeError when nothing is available
        get_backend()
        return Snapshot.objects.none()

    snapshot_pks: list[str] = []
    errors: list[Exception] = []
    successful_backends = 0
    try:
        for backend_name in backend_names:
            backend = backends[backend_name]
            try:
                # only the ripgrep backend is passed the search mode
                if backend_name == "ripgrep":
                    snapshot_pks.extend(backend.search(query, search_mode=search_mode))
                else:
                    snapshot_pks.extend(backend.search(query))
                successful_backends += 1
            except Exception as err:
                errors.append(err)
                if search_mode != "deep":
                    raise
    except Exception as err:
        stderr()
        stderr(
            f"[X] The search backend threw an exception={err}:",
            color="red",
        )
        raise
    else:
        if not successful_backends and errors and search_mode == "deep":
            raise errors[0]
        # dict.fromkeys de-duplicates while keeping first-seen order
        return Snapshot.objects.filter(pk__in=list(dict.fromkeys(snapshot_pks)))


@enforce_types
def flush_search_index(snapshots: QuerySet) -> None:
    """
    Remove snapshots from the search index.

    Does nothing when USE_INDEXING_BACKEND is off or the queryset is empty.
    Exceptions raised by the backend's flush() are logged to stderr and
    swallowed.
    """
    # .exists() asks the database for a single row; truth-testing the queryset
    # would load and cache every Snapshot just to find out whether there are any.
    if not SEARCH_BACKEND_CONFIG.USE_INDEXING_BACKEND or not snapshots.exists():
        return

    backend = get_backend()
    snapshot_pks = [str(pk) for pk in snapshots.values_list("pk", flat=True)]

    try:
        backend.flush(snapshot_pks)
    except Exception as err:
        stderr()
        stderr(
            f"[X] The search backend threw an exception={err}:",
            color="red",
        )


# ---------------------------------------------------------------------------
# archivebox/search/admin.py  (new file in this patch)
# ---------------------------------------------------------------------------
__package__ = "archivebox.search"

from django.contrib import messages
from django.contrib import admin
from django.contrib.admin.views.main import ChangeList, ORDER_VAR

from archivebox.search import get_default_search_mode, get_search_mode, prioritize_metadata_matches, query_search_index


class SearchResultsChangeList(ChangeList):
    """ChangeList that drops the ``search_mode`` parameter from the filter params."""

    def get_filters_params(self, params=None):
        """Return the parent's lookup params with ``search_mode`` removed."""
        lookup_params = super().get_filters_params(params)
        lookup_params.pop("search_mode", None)
        return lookup_params


class SearchResultsAdminMixin(admin.ModelAdmin):
    """ModelAdmin mixin that augments admin search with search-backend results."""

    show_search_mode_selector = True

    def get_changelist(self, request, **kwargs):
        """Use SearchResultsChangeList for the changelist view."""
        return SearchResultsChangeList

    def get_default_search_mode(self):
        """Return the module-level default search mode."""
        return get_default_search_mode()

    def get_search_results(self, request, queryset, search_term: str):
        """
        Enhances the search queryset with results from the search backend

        The search mode comes from the ``search_mode`` GET parameter. Blank
        terms and "meta" mode return the default admin search results
        unchanged (apart from ``distinct()``). If the backend fails, the
        error is printed, a warning message is attached to the request and
        the default results are returned.
        """

        qs, use_distinct = super().get_search_results(request, queryset, search_term)

        search_term = search_term.strip()
        if not search_term:
            return qs.distinct(), use_distinct
        search_mode = get_search_mode(request.GET.get("search_mode"))
        if search_mode == "meta":
            return qs.distinct(), use_distinct
        try:
            deep_qsearch = None
            if search_mode == "deep":
                # contents matches rank above deep matches, so both are fetched
                qsearch = query_search_index(search_term, search_mode="contents")
                deep_qsearch = query_search_index(search_term, search_mode="deep")
            else:
                qsearch = query_search_index(search_term, search_mode=search_mode)
            qs = prioritize_metadata_matches(
                queryset,
                qs,
                qsearch,
                deep_queryset=deep_qsearch,
                # rank-only ordering unless the request carries an explicit sort
                ordering=() if not request.GET.get(ORDER_VAR) else None,
            )
        except Exception as err:
            print(f"[!] Error while using search backend: {err.__class__.__name__} {err}")
            messages.add_message(
                request,
                messages.WARNING,
                f"Error from the search backend, only showing results from default admin search fields - Error: {err}",
            )

        return qs.distinct(), use_distinct
binary not found, install ripgrep to use this search backend") - - from core.models import Snapshot - - rg_cmd = ['rg', RG_ADD_TYPE, RG_IGNORE_ARGUMENTS, RG_DEFAULT_ARGUMENTS, RG_REGEX_ARGUMENT, text, str(ARCHIVE_DIR)] - rg = run(rg_cmd, stdout=PIPE, stderr=PIPE, timeout=SEARCH_BACKEND_TIMEOUT) - file_paths = [p.decode() for p in rg.stdout.splitlines()] - timestamps = set() - for path in file_paths: - ts = ts_regex.findall(path) - if ts: - timestamps.add(ts[0]) - - snap_ids = [str(id) for id in Snapshot.objects.filter(timestamp__in=timestamps).values_list('pk', flat=True)] - - return snap_ids diff --git a/archivebox/search/backends/sonic.py b/archivebox/search/backends/sonic.py deleted file mode 100644 index 8bde333ca1..0000000000 --- a/archivebox/search/backends/sonic.py +++ /dev/null @@ -1,44 +0,0 @@ -from typing import List, Generator - -from sonic import IngestClient, SearchClient - -from archivebox.util import enforce_types -from archivebox.config import SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD, SONIC_BUCKET, SONIC_COLLECTION - -MAX_SONIC_TEXT_TOTAL_LENGTH = 100000000 # dont index more than 100 million characters per text -MAX_SONIC_TEXT_CHUNK_LENGTH = 2000 # dont index more than 2000 characters per chunk -MAX_SONIC_ERRORS_BEFORE_ABORT = 5 - -@enforce_types -def index(snapshot_id: str, texts: List[str]): - error_count = 0 - with IngestClient(SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD) as ingestcl: - for text in texts: - chunks = ( - text[i:i+MAX_SONIC_TEXT_CHUNK_LENGTH] - for i in range( - 0, - min(len(text), MAX_SONIC_TEXT_TOTAL_LENGTH), - MAX_SONIC_TEXT_CHUNK_LENGTH, - ) - ) - try: - for chunk in chunks: - ingestcl.push(SONIC_COLLECTION, SONIC_BUCKET, snapshot_id, str(chunk)) - except Exception as err: - print(f'[!] 
Sonic search backend threw an error while indexing: {err.__class__.__name__} {err}') - error_count += 1 - if error_count > MAX_SONIC_ERRORS_BEFORE_ABORT: - raise - -@enforce_types -def search(text: str) -> List[str]: - with SearchClient(SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD) as querycl: - snap_ids = querycl.query(SONIC_COLLECTION, SONIC_BUCKET, text) - return snap_ids - -@enforce_types -def flush(snapshot_ids: Generator[str, None, None]): - with IngestClient(SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD) as ingestcl: - for id in snapshot_ids: - ingestcl.flush_object(SONIC_COLLECTION, SONIC_BUCKET, str(id)) diff --git a/archivebox/search/utils.py b/archivebox/search/utils.py deleted file mode 100644 index 723c7fb5e5..0000000000 --- a/archivebox/search/utils.py +++ /dev/null @@ -1,45 +0,0 @@ -from django.db.models import QuerySet - -from archivebox.util import enforce_types -from archivebox.config import ANSI - -def log_index_started(url): - print('{green}[*] Indexing url: {} in the search index {reset}'.format(url, **ANSI)) - print( ) - -def get_file_result_content(res, extra_path, use_pwd=False): - if use_pwd: - fpath = f'{res.pwd}/{res.output}' - else: - fpath = f'{res.output}' - - if extra_path: - fpath = f'{fpath}/{extra_path}' - - with open(fpath, 'r', encoding='utf-8') as file: - data = file.read() - if data: - return [data] - return [] - - -# This should be abstracted by a plugin interface for extractors -@enforce_types -def get_indexable_content(results: QuerySet): - if not results: - return [] - # Only use the first method available - res, method = results.first(), results.first().extractor - if method not in ('readability', 'singlefile', 'dom', 'wget'): - return [] - # This should come from a plugin interface - - # TODO: banish this duplication and get these from the extractor file - if method == 'readability': - return get_file_result_content(res, 'content.txt', use_pwd=True) - elif method == 
'singlefile': - return get_file_result_content(res, '', use_pwd=True) - elif method == 'dom': - return get_file_result_content(res, '', use_pwd=True) - elif method == 'wget': - return get_file_result_content(res, '', use_pwd=True) diff --git a/archivebox/services/__init__.py b/archivebox/services/__init__.py new file mode 100644 index 0000000000..8b41348bab --- /dev/null +++ b/archivebox/services/__init__.py @@ -0,0 +1,22 @@ +from .archive_result_service import ArchiveResultService +from .binary_service import BinaryService +from .crawl_service import CrawlService +from .machine_service import MachineService +from .process_service import ProcessService +from .runner import run_binary, run_crawl, run_install, run_pending_crawls +from .snapshot_service import SnapshotService +from .tag_service import TagService + +__all__ = [ + "ArchiveResultService", + "BinaryService", + "CrawlService", + "MachineService", + "ProcessService", + "SnapshotService", + "TagService", + "run_binary", + "run_crawl", + "run_install", + "run_pending_crawls", +] diff --git a/archivebox/services/archive_result_service.py b/archivebox/services/archive_result_service.py new file mode 100644 index 0000000000..2fe4135903 --- /dev/null +++ b/archivebox/services/archive_result_service.py @@ -0,0 +1,316 @@ +from __future__ import annotations + +import json +from collections import defaultdict +from collections.abc import Iterable +from pathlib import Path +from typing import Any + +from asgiref.sync import sync_to_async +from django.utils import timezone + +from abx_dl.events import ArchiveResultEvent, ProcessCompletedEvent, ProcessStartedEvent, SnapshotEvent +from abx_dl.output_files import guess_mimetype +from abx_dl.services.base import BaseService + +from .process_service import parse_event_datetime + + +def _collect_output_metadata(plugin_dir: Path) -> tuple[dict[str, dict], int, str]: + exclude_names = {"stdout.log", "stderr.log", "process.pid", "hook.pid", "listener.pid", "cmd.sh"} + 
output_files: dict[str, dict] = {} + mime_sizes: dict[str, int] = defaultdict(int) + total_size = 0 + + if not plugin_dir.exists(): + return output_files, total_size, "" + + for file_path in plugin_dir.rglob("*"): + if not file_path.is_file(): + continue + if ".hooks" in file_path.parts: + continue + if file_path.name in exclude_names: + continue + try: + stat = file_path.stat() + except OSError: + continue + mime_type = guess_mimetype(file_path) or "application/octet-stream" + relative_path = str(file_path.relative_to(plugin_dir)) + output_files[relative_path] = { + "extension": file_path.suffix.lower().lstrip("."), + "mimetype": mime_type, + "size": stat.st_size, + } + mime_sizes[mime_type] += stat.st_size + total_size += stat.st_size + + output_mimetypes = ",".join(mime for mime, _size in sorted(mime_sizes.items(), key=lambda item: item[1], reverse=True)) + return output_files, total_size, output_mimetypes + + +def _coerce_output_file_size(value: Any) -> int: + try: + return max(int(value or 0), 0) + except (TypeError, ValueError): + return 0 + + +def _normalize_output_files(raw_output_files: Any) -> dict[str, dict]: + def _enrich_metadata(path: str, metadata: dict[str, Any]) -> dict[str, Any]: + normalized = dict(metadata) + if "extension" not in normalized: + normalized["extension"] = Path(path).suffix.lower().lstrip(".") + if "mimetype" not in normalized: + guessed = guess_mimetype(path) + if guessed: + normalized["mimetype"] = guessed + return normalized + + if raw_output_files is None: + return {} + + if isinstance(raw_output_files, str): + try: + raw_output_files = json.loads(raw_output_files) + except json.JSONDecodeError: + return {} + + if isinstance(raw_output_files, dict): + normalized: dict[str, dict] = {} + for path, metadata in raw_output_files.items(): + if not path: + continue + metadata_dict = dict(metadata) if isinstance(metadata, dict) else {} + metadata_dict.pop("path", None) + normalized[str(path)] = _enrich_metadata(str(path), 
metadata_dict) + return normalized + + if not isinstance(raw_output_files, Iterable): + return {} + + normalized: dict[str, dict] = {} + for item in raw_output_files: + if isinstance(item, str): + normalized[item] = _enrich_metadata(item, {}) + continue + if hasattr(item, "model_dump"): + item = item.model_dump() + elif hasattr(item, "path"): + item = { + "path": getattr(item, "path", ""), + "extension": getattr(item, "extension", ""), + "mimetype": getattr(item, "mimetype", ""), + "size": getattr(item, "size", 0), + } + if not isinstance(item, dict): + continue + path = str(item.get("path") or "").strip() + if not path: + continue + normalized[path] = _enrich_metadata(path, {key: value for key, value in item.items() if key != "path" and value not in (None, "")}) + + return normalized + + +def _has_structured_output_metadata(output_files: dict[str, dict]) -> bool: + return any(any(key in metadata for key in ("extension", "mimetype", "size")) for metadata in output_files.values()) + + +def _summarize_output_files(output_files: dict[str, dict]) -> tuple[int, str]: + mime_sizes: dict[str, int] = defaultdict(int) + total_size = 0 + + for metadata in output_files.values(): + if not isinstance(metadata, dict): + continue + size = _coerce_output_file_size(metadata.get("size")) + mimetype = str(metadata.get("mimetype") or "").strip() + total_size += size + if mimetype and size: + mime_sizes[mimetype] += size + + output_mimetypes = ",".join(mime for mime, _size in sorted(mime_sizes.items(), key=lambda item: item[1], reverse=True)) + return total_size, output_mimetypes + + +def _resolve_output_metadata(raw_output_files: Any, plugin_dir: Path) -> tuple[dict[str, dict], int, str]: + normalized_output_files = _normalize_output_files(raw_output_files) + if normalized_output_files and _has_structured_output_metadata(normalized_output_files): + output_size, output_mimetypes = _summarize_output_files(normalized_output_files) + return normalized_output_files, output_size, 
output_mimetypes + return _collect_output_metadata(plugin_dir) + + +def _normalize_status(status: str) -> str: + if status == "noresult": + return "noresults" + return status or "failed" + + +def _normalize_snapshot_title(candidate: str, *, snapshot_url: str) -> str: + title = " ".join(line.strip() for line in str(candidate or "").splitlines() if line.strip()).strip() + if not title: + return "" + if title.lower() in {"pending...", "no title found"}: + return "" + if title == snapshot_url: + return "" + if "/" in title and title.lower().endswith(".txt"): + return "" + return title + + +def _extract_snapshot_title(snapshot_output_dir: str, plugin: str, output_str: str, *, snapshot_url: str) -> str: + if plugin != "title": + return "" + + title_file = Path(snapshot_output_dir) / "title" / "title.txt" + if title_file.exists(): + try: + file_title = _normalize_snapshot_title(title_file.read_text(encoding="utf-8"), snapshot_url=snapshot_url) + except OSError: + file_title = "" + if file_title: + return file_title + + return _normalize_snapshot_title(output_str, snapshot_url=snapshot_url) + + +def _should_update_snapshot_title(current_title: str, next_title: str, *, snapshot_url: str) -> bool: + current = (current_title or "").strip() + if not current or current.lower() == "pending..." 
or current == snapshot_url: + return True + return len(next_title) > len(current) + + +def _has_content_files(output_files: Any) -> bool: + return any(Path(path).suffix not in {".log", ".pid", ".sh"} for path in _normalize_output_files(output_files)) + + +def _iter_archiveresult_records(stdout: str) -> list[dict]: + records: list[dict] = [] + for raw_line in stdout.splitlines(): + line = raw_line.strip() + if not line.startswith("{"): + continue + try: + record = json.loads(line) + except json.JSONDecodeError: + continue + if record.get("type") == "ArchiveResult": + records.append(record) + return records + + +class ArchiveResultService(BaseService): + LISTENS_TO = [ArchiveResultEvent, ProcessCompletedEvent] + EMITS = [] + + def __init__(self, bus): + super().__init__(bus) + self.bus.on(ArchiveResultEvent, self.on_ArchiveResultEvent__save_to_db) + self.bus.on(ProcessCompletedEvent, self.on_ProcessCompletedEvent__save_to_db) + + async def on_ArchiveResultEvent__save_to_db(self, event: ArchiveResultEvent) -> None: + from archivebox.core.models import ArchiveResult, Snapshot + from archivebox.machine.models import Process + + snapshot = await Snapshot.objects.filter(id=event.snapshot_id).select_related("crawl", "crawl__created_by").afirst() + if snapshot is None: + return + plugin_dir = Path(snapshot.output_dir) / event.plugin + output_files, output_size, output_mimetypes = await sync_to_async(_resolve_output_metadata)(event.output_files, plugin_dir) + process_started = await self.bus.find( + ProcessStartedEvent, + past=True, + future=False, + where=lambda candidate: self.bus.event_is_child_of(event, candidate), + ) + process = None + if process_started is not None: + started_at = parse_event_datetime(process_started.start_ts) + if started_at is None: + raise ValueError("ProcessStartedEvent.start_ts is required") + process_query = Process.objects.filter( + pwd=process_started.output_dir, + cmd=[process_started.hook_path, *process_started.hook_args], + 
started_at=started_at, + ) + if process_started.pid: + process_query = process_query.filter(pid=process_started.pid) + process = await process_query.order_by("-modified_at").afirst() + + result, _created = await ArchiveResult.objects.aget_or_create( + snapshot=snapshot, + plugin=event.plugin, + hook_name=event.hook_name, + defaults={ + "status": ArchiveResult.StatusChoices.STARTED, + "process": process, + }, + ) + + result.process = process or result.process + result.status = _normalize_status(event.status) + result.output_str = event.output_str + result.output_json = event.output_json + result.output_files = output_files + result.output_size = output_size + result.output_mimetypes = output_mimetypes + result.start_ts = parse_event_datetime(event.start_ts) or result.start_ts or timezone.now() + result.end_ts = parse_event_datetime(event.end_ts) or timezone.now() + if event.error: + result.notes = event.error + await result.asave() + + next_title = _extract_snapshot_title(str(snapshot.output_dir), event.plugin, result.output_str, snapshot_url=snapshot.url) + if next_title and _should_update_snapshot_title(snapshot.title or "", next_title, snapshot_url=snapshot.url): + snapshot.title = next_title + await snapshot.asave(update_fields=["title", "modified_at"]) + + async def on_ProcessCompletedEvent__save_to_db(self, event: ProcessCompletedEvent) -> None: + if not event.hook_name.startswith("on_Snapshot"): + return + snapshot_event = await self.bus.find( + SnapshotEvent, + past=True, + future=False, + where=lambda candidate: self.bus.event_is_child_of(event, candidate), + ) + if snapshot_event is None: + return + + records = _iter_archiveresult_records(event.stdout) + if records: + for record in records: + await self.bus.emit( + ArchiveResultEvent( + snapshot_id=record.get("snapshot_id") or snapshot_event.snapshot_id, + plugin=record.get("plugin") or event.plugin_name, + hook_name=record.get("hook_name") or event.hook_name, + status=record.get("status") or "", + 
output_str=record.get("output_str") or "", + output_json=record.get("output_json") if isinstance(record.get("output_json"), dict) else None, + output_files=event.output_files, + start_ts=event.start_ts, + end_ts=event.end_ts, + error=record.get("error") or (event.stderr if event.exit_code != 0 else ""), + ), + ) + return + + await self.bus.emit( + ArchiveResultEvent( + snapshot_id=snapshot_event.snapshot_id, + plugin=event.plugin_name, + hook_name=event.hook_name, + status="failed" if event.exit_code != 0 else ("succeeded" if _has_content_files(event.output_files) else "skipped"), + output_str=event.stderr if event.exit_code != 0 else "", + output_files=event.output_files, + start_ts=event.start_ts, + end_ts=event.end_ts, + error=event.stderr if event.exit_code != 0 else "", + ), + ) diff --git a/archivebox/services/binary_service.py b/archivebox/services/binary_service.py new file mode 100644 index 0000000000..5b18e39161 --- /dev/null +++ b/archivebox/services/binary_service.py @@ -0,0 +1,103 @@ +from __future__ import annotations + +from asgiref.sync import sync_to_async + +from abx_dl.events import BinaryRequestEvent, BinaryEvent +from abx_dl.services.base import BaseService + + +class BinaryService(BaseService): + LISTENS_TO = [BinaryRequestEvent, BinaryEvent] + EMITS = [] + + def __init__(self, bus): + super().__init__(bus) + self.bus.on(BinaryRequestEvent, self.on_BinaryRequestEvent) + self.bus.on(BinaryEvent, self.on_BinaryEvent) + + async def on_BinaryRequestEvent(self, event: BinaryRequestEvent) -> None: + from archivebox.machine.models import Binary, Machine + + machine = await sync_to_async(Machine.current, thread_sensitive=True)() + existing = await Binary.objects.filter(machine=machine, name=event.name).afirst() + if existing and existing.status == Binary.StatusChoices.INSTALLED: + changed = False + if event.binproviders and existing.binproviders != event.binproviders: + existing.binproviders = event.binproviders + changed = True + if event.overrides 
and existing.overrides != event.overrides: + existing.overrides = event.overrides + changed = True + if changed: + await existing.asave(update_fields=["binproviders", "overrides", "modified_at"]) + elif existing is None: + await Binary.objects.acreate( + machine=machine, + name=event.name, + binproviders=event.binproviders, + overrides=event.overrides or {}, + status=Binary.StatusChoices.QUEUED, + ) + + installed = ( + await Binary.objects.filter(machine=machine, name=event.name, status=Binary.StatusChoices.INSTALLED) + .exclude(abspath="") + .exclude(abspath__isnull=True) + .order_by("-modified_at") + .afirst() + ) + cached = None + if installed is not None: + cached = { + "abspath": installed.abspath, + "version": installed.version or "", + "sha256": installed.sha256 or "", + "binproviders": installed.binproviders or "", + "binprovider": installed.binprovider or "", + "machine_id": str(installed.machine_id), + "overrides": installed.overrides or {}, + } + if cached is not None: + await self.bus.emit( + BinaryEvent( + name=event.name, + plugin_name=event.plugin_name, + hook_name=event.hook_name, + abspath=cached["abspath"], + version=cached["version"], + sha256=cached["sha256"], + binproviders=event.binproviders or cached["binproviders"], + binprovider=cached["binprovider"], + overrides=event.overrides or cached["overrides"], + binary_id=event.binary_id, + machine_id=cached["machine_id"], + ), + ) + + async def on_BinaryEvent(self, event: BinaryEvent) -> None: + from archivebox.machine.models import Binary, Machine + + machine = await sync_to_async(Machine.current, thread_sensitive=True)() + binary, _ = await Binary.objects.aget_or_create( + machine=machine, + name=event.name, + defaults={ + "status": Binary.StatusChoices.QUEUED, + }, + ) + binary.abspath = event.abspath + if event.version: + binary.version = event.version + if event.sha256: + binary.sha256 = event.sha256 + if event.binproviders: + binary.binproviders = event.binproviders + if event.binprovider: + 
binary.binprovider = event.binprovider + if event.overrides and binary.overrides != event.overrides: + binary.overrides = event.overrides + binary.status = Binary.StatusChoices.INSTALLED + binary.retry_at = None + await binary.asave( + update_fields=["abspath", "version", "sha256", "binproviders", "binprovider", "overrides", "status", "retry_at", "modified_at"], + ) diff --git a/archivebox/services/crawl_service.py b/archivebox/services/crawl_service.py new file mode 100644 index 0000000000..fd81f7e67c --- /dev/null +++ b/archivebox/services/crawl_service.py @@ -0,0 +1,52 @@ +from __future__ import annotations + +from abx_dl.events import CrawlCleanupEvent, CrawlCompletedEvent, CrawlSetupEvent, CrawlStartEvent +from abx_dl.services.base import BaseService + + +class CrawlService(BaseService): + LISTENS_TO = [CrawlSetupEvent, CrawlStartEvent, CrawlCleanupEvent, CrawlCompletedEvent] + EMITS = [] + + def __init__(self, bus, *, crawl_id: str): + self.crawl_id = crawl_id + super().__init__(bus) + self.bus.on(CrawlSetupEvent, self.on_CrawlSetupEvent__save_to_db) + self.bus.on(CrawlStartEvent, self.on_CrawlStartEvent__save_to_db) + self.bus.on(CrawlCleanupEvent, self.on_CrawlCleanupEvent__save_to_db) + self.bus.on(CrawlCompletedEvent, self.on_CrawlCompletedEvent__save_to_db) + + async def on_CrawlSetupEvent__save_to_db(self, event: CrawlSetupEvent) -> None: + from archivebox.crawls.models import Crawl + + crawl = await Crawl.objects.aget(id=self.crawl_id) + if crawl.status != Crawl.StatusChoices.SEALED: + crawl.status = Crawl.StatusChoices.STARTED + crawl.retry_at = None + await crawl.asave(update_fields=["status", "retry_at", "modified_at"]) + + async def on_CrawlStartEvent__save_to_db(self, event: CrawlStartEvent) -> None: + from archivebox.crawls.models import Crawl + + crawl = await Crawl.objects.aget(id=self.crawl_id) + if crawl.status != Crawl.StatusChoices.SEALED: + crawl.status = Crawl.StatusChoices.STARTED + crawl.retry_at = None + await 
crawl.asave(update_fields=["status", "retry_at", "modified_at"]) + + async def on_CrawlCleanupEvent__save_to_db(self, event: CrawlCleanupEvent) -> None: + from archivebox.crawls.models import Crawl + + crawl = await Crawl.objects.aget(id=self.crawl_id) + if crawl.status != Crawl.StatusChoices.SEALED: + crawl.status = Crawl.StatusChoices.STARTED + crawl.retry_at = None + await crawl.asave(update_fields=["status", "retry_at", "modified_at"]) + + async def on_CrawlCompletedEvent__save_to_db(self, event: CrawlCompletedEvent) -> None: + from archivebox.crawls.models import Crawl + + crawl = await Crawl.objects.aget(id=self.crawl_id) + crawl.status = Crawl.StatusChoices.SEALED + crawl.retry_at = None + await crawl.asave(update_fields=["status", "retry_at", "modified_at"]) diff --git a/archivebox/services/live_ui.py b/archivebox/services/live_ui.py new file mode 100644 index 0000000000..a89f016c02 --- /dev/null +++ b/archivebox/services/live_ui.py @@ -0,0 +1,3 @@ +from abx_dl.cli import LiveBusUI + +__all__ = ["LiveBusUI"] diff --git a/archivebox/services/machine_service.py b/archivebox/services/machine_service.py new file mode 100644 index 0000000000..f451ab36c8 --- /dev/null +++ b/archivebox/services/machine_service.py @@ -0,0 +1,33 @@ +from __future__ import annotations + +from asgiref.sync import sync_to_async + +from abx_dl.events import MachineEvent +from abx_dl.services.base import BaseService + + +class MachineService(BaseService): + LISTENS_TO = [MachineEvent] + EMITS = [] + + def __init__(self, bus): + super().__init__(bus) + self.bus.on(MachineEvent, self.on_MachineEvent__save_to_db) + + async def on_MachineEvent__save_to_db(self, event: MachineEvent) -> None: + from archivebox.machine.models import Machine, _sanitize_machine_config + + machine = await sync_to_async(Machine.current, thread_sensitive=True)() + config = dict(machine.config or {}) + + if event.config is not None: + config.update(_sanitize_machine_config(event.config)) + elif event.method == 
"update": + key = event.key.replace("config/", "", 1).strip() + if key: + config[key] = event.value + else: + return + + machine.config = _sanitize_machine_config(config) + await machine.asave(update_fields=["config", "modified_at"]) diff --git a/archivebox/services/process_service.py b/archivebox/services/process_service.py new file mode 100644 index 0000000000..cdcb9bbf12 --- /dev/null +++ b/archivebox/services/process_service.py @@ -0,0 +1,155 @@ +from __future__ import annotations + +from datetime import datetime +from typing import ClassVar + +from asgiref.sync import sync_to_async +from django.utils import timezone + +from abxbus import BaseEvent +from abx_dl.events import ProcessCompletedEvent, ProcessStartedEvent +from abx_dl.services.base import BaseService + + +def parse_event_datetime(value: str | None): + if not value: + return None + try: + dt = datetime.fromisoformat(value) + except ValueError: + return None + if timezone.is_naive(dt): + return timezone.make_aware(dt, timezone.get_current_timezone()) + return dt + + +class ProcessService(BaseService): + LISTENS_TO: ClassVar[list[type[BaseEvent]]] = [ProcessStartedEvent, ProcessCompletedEvent] + EMITS: ClassVar[list[type[BaseEvent]]] = [] + + def __init__(self, bus): + super().__init__(bus) + self.bus.on(ProcessStartedEvent, self.on_ProcessStartedEvent__save_to_db) + self.bus.on(ProcessCompletedEvent, self.on_ProcessCompletedEvent__save_to_db) + + async def on_ProcessStartedEvent__save_to_db(self, event: ProcessStartedEvent) -> None: + from archivebox.machine.models import NetworkInterface, Process + + iface = await sync_to_async(NetworkInterface.current, thread_sensitive=True)(refresh=True) + process_type = event.process_type or ( + Process.TypeChoices.BINARY if event.hook_name.startswith("on_BinaryRequest") else Process.TypeChoices.HOOK + ) + worker_type = event.worker_type or "" + started_at = parse_event_datetime(event.start_ts) + if started_at is None: + raise 
ValueError("ProcessStartedEvent.start_ts is required") + process_query = Process.objects.filter( + process_type=process_type, + worker_type=worker_type, + pwd=event.output_dir, + cmd=[event.hook_path, *event.hook_args], + started_at=started_at, + ) + if event.pid: + process_query = process_query.filter(pid=event.pid) + process = await process_query.order_by("-modified_at").afirst() + if process is None: + process = await Process.objects.acreate( + machine=iface.machine, + iface=iface, + process_type=process_type, + worker_type=worker_type, + pwd=event.output_dir, + cmd=[event.hook_path, *event.hook_args], + env=event.env, + timeout=event.timeout, + pid=event.pid or None, + url=event.url or None, + started_at=started_at, + status=Process.StatusChoices.RUNNING, + retry_at=None, + ) + elif process.iface_id != iface.id or process.machine_id != iface.machine_id: + process.iface = iface + process.machine = iface.machine + await process.asave(update_fields=["iface", "machine", "modified_at"]) + + process.pwd = event.output_dir + process.cmd = [event.hook_path, *event.hook_args] + process.env = event.env + process.timeout = event.timeout + process.pid = event.pid or None + process.url = event.url or process.url + process.process_type = process_type or process.process_type + process.worker_type = worker_type or process.worker_type + process.started_at = started_at + process.status = process.StatusChoices.RUNNING + process.retry_at = None + await sync_to_async(process.hydrate_binary_from_context, thread_sensitive=True)( + plugin_name=event.plugin_name, + hook_path=event.hook_path, + ) + await process.asave() + + async def on_ProcessCompletedEvent__save_to_db(self, event: ProcessCompletedEvent) -> None: + from archivebox.machine.models import NetworkInterface, Process + + iface = await sync_to_async(NetworkInterface.current, thread_sensitive=True)(refresh=True) + process_type = event.process_type or ( + Process.TypeChoices.BINARY if 
event.hook_name.startswith("on_BinaryRequest") else Process.TypeChoices.HOOK + ) + worker_type = event.worker_type or "" + started_at = parse_event_datetime(event.start_ts) + if started_at is None: + raise ValueError("ProcessCompletedEvent.start_ts is required") + process_query = Process.objects.filter( + process_type=process_type, + worker_type=worker_type, + pwd=event.output_dir, + cmd=[event.hook_path, *event.hook_args], + started_at=started_at, + ) + if event.pid: + process_query = process_query.filter(pid=event.pid) + process = await process_query.order_by("-modified_at").afirst() + if process is None: + process = await Process.objects.acreate( + machine=iface.machine, + iface=iface, + process_type=process_type, + worker_type=worker_type, + pwd=event.output_dir, + cmd=[event.hook_path, *event.hook_args], + env=event.env, + timeout=event.timeout, + pid=event.pid or None, + url=event.url or None, + started_at=started_at, + status=Process.StatusChoices.RUNNING, + retry_at=None, + ) + elif process.iface_id != iface.id or process.machine_id != iface.machine_id: + process.iface = iface + process.machine = iface.machine + await process.asave(update_fields=["iface", "machine", "modified_at"]) + + process.pwd = event.output_dir + if not process.cmd: + process.cmd = [event.hook_path, *event.hook_args] + process.env = event.env + process.pid = event.pid or process.pid + process.url = event.url or process.url + process.process_type = process_type or process.process_type + process.worker_type = worker_type or process.worker_type + process.started_at = started_at + process.ended_at = parse_event_datetime(event.end_ts) or timezone.now() + process.stdout = event.stdout + process.stderr = event.stderr + process.exit_code = event.exit_code + process.status = process.StatusChoices.EXITED + process.retry_at = None + await sync_to_async(process.hydrate_binary_from_context, thread_sensitive=True)( + plugin_name=event.plugin_name, + hook_path=event.hook_path, + ) + await 
def _bus_name(prefix: str, identifier: str) -> str:
    """Build a bus name of the form ``<prefix>_<identifier>``.

    Every character of *identifier* that is not alphanumeric (per
    ``str.isalnum``) is replaced by an underscore; *prefix* is used as given.
    """
    safe_chars = []
    for char in identifier:
        safe_chars.append(char if char.isalnum() else "_")
    return prefix + "_" + "".join(safe_chars)
def ensure_background_runner(*, allow_under_pytest: bool = False) -> bool:
    """Spawn a detached ``archivebox run --daemon`` process if none is running.

    Returns ``False`` without doing anything when ``PYTEST_CURRENT_TEST`` is
    set (unless *allow_under_pytest* is true), or when a RUNNING orchestrator
    Process row already exists for the current machine. Otherwise the daemon
    is started in a new session with stdout/stderr appended to
    ``LOGS_DIR/errors.log`` and ``True`` is returned.
    """
    running_under_pytest = bool(os.environ.get("PYTEST_CURRENT_TEST"))
    if running_under_pytest and not allow_under_pytest:
        return False

    from archivebox.config import CONSTANTS
    from archivebox.machine.models import Machine, Process

    # Clear out stale bookkeeping first so the existence check below is meaningful.
    Process.cleanup_stale_running()
    Process.cleanup_orphaned_workers()
    live_orchestrators = Process.objects.filter(
        machine=Machine.current(),
        status=Process.StatusChoices.RUNNING,
        process_type=Process.TypeChoices.ORCHESTRATOR,
    )
    if live_orchestrators.exists():
        return False

    errors_log = CONSTANTS.LOGS_DIR / "errors.log"
    errors_log.parent.mkdir(parents=True, exist_ok=True)

    child_env = dict(os.environ)
    if "DATA_DIR" not in child_env:
        child_env["DATA_DIR"] = str(CONSTANTS.DATA_DIR)

    daemon_cmd = [sys.executable, "-m", "archivebox", "run", "--daemon"]
    with errors_log.open("a", encoding="utf-8") as sink:
        subprocess.Popen(
            daemon_cmd,
            cwd=str(CONSTANTS.DATA_DIR),
            env=child_env,
            stdin=subprocess.DEVNULL,
            stdout=sink,
            stderr=sink,
            start_new_session=True,
        )
    return True
MachineService(self.bus) + self.process_discovered_snapshots_inline = process_discovered_snapshots_inline + + async def ignore_snapshot(_snapshot_id: str) -> None: + return None + + SnapshotService( + self.bus, + crawl_id=str(crawl.id), + schedule_snapshot=self.enqueue_snapshot if process_discovered_snapshots_inline else ignore_snapshot, + ) + ArchiveResultService(self.bus) + self.selected_plugins = selected_plugins + self.initial_snapshot_ids = snapshot_ids + self.snapshot_tasks: dict[str, asyncio.Task[None]] = {} + self.snapshot_semaphore = asyncio.Semaphore(self.MAX_CONCURRENT_SNAPSHOTS) + self.persona = None + self.base_config: dict[str, Any] = {} + self.derived_config: dict[str, Any] = {} + self.primary_url = "" + self.crawl_output_dir = "" + self._live_stream = None + + async def run(self) -> None: + heartbeat = CrawlHeartbeat( + Path(self.crawl_output_dir), + runtime="archivebox", + crawl_id=str(self.crawl.id), + ) + try: + snapshot_ids = await sync_to_async(self.load_run_state, thread_sensitive=True)() + live_ui = self._create_live_ui() + with live_ui if live_ui is not None else nullcontext(): + await heartbeat.start() + if snapshot_ids: + root_snapshot = await sync_to_async(self.load_snapshot_payload, thread_sensitive=True)(snapshot_ids[0]) + setup_abx_services( + self.bus, + plugins=self.plugins, + url=root_snapshot["url"], + snapshot=AbxSnapshot( + id=root_snapshot["id"], + url=root_snapshot["url"], + depth=int(root_snapshot["depth"]), + crawl_id=str(self.crawl.id), + ), + output_dir=Path(root_snapshot["output_dir"]), + install_enabled=False, + crawl_setup_enabled=False, + crawl_start_enabled=False, + snapshot_cleanup_enabled=False, + crawl_cleanup_enabled=False, + persist_derived=False, + auto_install=True, + emit_jsonl=False, + ) + await _emit_machine_config( + self.bus, + config={ + **self.base_config, + "ABX_RUNTIME": "archivebox", + }, + derived_config=self.derived_config, + ) + if snapshot_ids: + root_snapshot_id = snapshot_ids[0] + await 
self.run_crawl_setup(root_snapshot_id) + for snapshot_id in snapshot_ids: + await self.enqueue_snapshot(snapshot_id) + await self.wait_for_snapshot_tasks() + await self.run_crawl_cleanup(root_snapshot_id) + finally: + await heartbeat.stop() + await self.bus.stop() + if self._live_stream is not None: + try: + self._live_stream.close() + except Exception: + pass + self._live_stream = None + await sync_to_async(self.finalize_run_state, thread_sensitive=True)() + + async def enqueue_snapshot(self, snapshot_id: str) -> None: + task = self.snapshot_tasks.get(snapshot_id) + if task is not None and not task.done(): + return + task = asyncio.create_task(self.run_snapshot(snapshot_id)) + self.snapshot_tasks[snapshot_id] = task + + async def wait_for_snapshot_tasks(self) -> None: + while True: + pending_tasks: list[asyncio.Task[None]] = [] + for snapshot_id, task in list(self.snapshot_tasks.items()): + if task.done(): + if self.snapshot_tasks.get(snapshot_id) is task: + self.snapshot_tasks.pop(snapshot_id, None) + task.result() + continue + pending_tasks.append(task) + if not pending_tasks: + return + done, _pending = await asyncio.wait(pending_tasks, return_when=asyncio.FIRST_COMPLETED) + for task in done: + task.result() + + def load_run_state(self) -> list[str]: + from archivebox.config.configset import get_config + from archivebox.hooks import discover_hooks + from archivebox.machine.models import Machine, NetworkInterface, Process + + self.primary_url = self.crawl.get_urls_list()[0] if self.crawl.get_urls_list() else "" + current_iface = NetworkInterface.current(refresh=True) + current_process = Process.current() + if current_process.iface_id != current_iface.id or current_process.machine_id != current_iface.machine_id: + current_process.iface = current_iface + current_process.machine = current_iface.machine + current_process.save(update_fields=["iface", "machine", "modified_at"]) + self.persona = self.crawl.resolve_persona() + self.base_config = 
get_config(crawl=self.crawl) + self.derived_config = dict(Machine.current().config) + self.crawl_output_dir = str(self.crawl.output_dir) + self.base_config["ABX_RUNTIME"] = "archivebox" + if self.selected_plugins is None: + raw_plugins = str(self.base_config.get("PLUGINS") or "").strip() + if raw_plugins: + self.selected_plugins = [name.strip() for name in raw_plugins.split(",") if name.strip()] + else: + runtime_events = ("CrawlSetup", "CrawlCleanup", "Snapshot", "SnapshotCleanup") + runtime_plugins = { + hook.parent.name for event_name in runtime_events for hook in discover_hooks(event_name, config=self.base_config) + } + self.selected_plugins = sorted(runtime_plugins) or None + if self.persona: + self.base_config.update( + self.persona.prepare_runtime_for_crawl( + self.crawl, + chrome_binary=self.base_config["CHROME_BINARY"], + ), + ) + if self.initial_snapshot_ids: + return [str(snapshot_id) for snapshot_id in self.initial_snapshot_ids] + created = self.crawl.create_snapshots_from_urls() + snapshots = created or list(self.crawl.snapshot_set.filter(depth=0).order_by("created_at")) + return [str(snapshot.id) for snapshot in snapshots] + + def finalize_run_state(self) -> None: + from archivebox.crawls.models import Crawl + + if self.persona: + self.persona.cleanup_runtime_for_crawl(self.crawl) + crawl = Crawl.objects.get(id=self.crawl.id) + if crawl.is_finished(): + if crawl.status != Crawl.StatusChoices.SEALED: + crawl.status = Crawl.StatusChoices.SEALED + crawl.retry_at = None + crawl.save(update_fields=["status", "retry_at", "modified_at"]) + return + if crawl.status == Crawl.StatusChoices.SEALED: + crawl.status = Crawl.StatusChoices.QUEUED + elif crawl.status != Crawl.StatusChoices.STARTED: + crawl.status = Crawl.StatusChoices.STARTED + crawl.retry_at = crawl.retry_at or timezone.now() + crawl.save(update_fields=["status", "retry_at", "modified_at"]) + + def _create_live_ui(self) -> LiveBusUI | None: + stdout_is_tty = sys.stdout.isatty() + stderr_is_tty = 
sys.stderr.isatty() + interactive_tty = stdout_is_tty or stderr_is_tty + if not interactive_tty: + return None + stream = sys.stderr if stderr_is_tty else sys.stdout + if os.path.exists("/dev/tty"): + try: + self._live_stream = open("/dev/tty", "w", buffering=1, encoding=stream.encoding or "utf-8") + stream = self._live_stream + except OSError: + self._live_stream = None + try: + terminal_size = os.get_terminal_size(stream.fileno()) + terminal_width = terminal_size.columns + terminal_height = terminal_size.lines + except (AttributeError, OSError, ValueError): + terminal_size = shutil.get_terminal_size(fallback=(160, 40)) + terminal_width = terminal_size.columns + terminal_height = terminal_size.lines + ui_console = Console( + file=stream, + force_terminal=True, + width=terminal_width, + height=terminal_height, + _environ={ + "COLUMNS": str(terminal_width), + "LINES": str(terminal_height), + }, + ) + plugins_label = ", ".join(self.selected_plugins) if self.selected_plugins else f"all ({len(self.plugins)} available)" + live_ui = LiveBusUI( + self.bus, + total_hooks=_count_selected_hooks(self.plugins, self.selected_plugins), + timeout_seconds=self.base_config["TIMEOUT"], + ui_console=ui_console, + interactive_tty=True, + ) + live_ui.print_intro( + url=self.primary_url or "crawl", + output_dir=Path(self.crawl_output_dir), + plugins_label=plugins_label, + ) + return live_ui + + def load_snapshot_payload(self, snapshot_id: str) -> dict[str, Any]: + from archivebox.core.models import Snapshot + from archivebox.config.configset import get_config + + snapshot = Snapshot.objects.select_related("crawl").get(id=snapshot_id) + config = get_config(crawl=self.crawl, snapshot=snapshot) + config.update(self.base_config) + config["CRAWL_DIR"] = self.crawl_output_dir + config["SNAP_DIR"] = str(snapshot.output_dir) + extra_context: dict[str, Any] = {} + if config.get("EXTRA_CONTEXT"): + parsed_extra_context = json.loads(str(config["EXTRA_CONTEXT"])) + if not 
isinstance(parsed_extra_context, dict): + raise TypeError("EXTRA_CONTEXT must decode to an object") + extra_context = parsed_extra_context + extra_context["snapshot_id"] = str(snapshot.id) + extra_context["snapshot_depth"] = snapshot.depth + config["EXTRA_CONTEXT"] = json.dumps(extra_context, separators=(",", ":"), sort_keys=True) + return { + "id": str(snapshot.id), + "url": snapshot.url, + "title": snapshot.title, + "timestamp": snapshot.timestamp, + "bookmarked_at": snapshot.bookmarked_at.isoformat() if snapshot.bookmarked_at else "", + "created_at": snapshot.created_at.isoformat() if snapshot.created_at else "", + "tags": snapshot.tags_str(), + "depth": snapshot.depth, + "status": snapshot.status, + "output_dir": str(snapshot.output_dir), + "config": config, + "_snapshot": snapshot, + } + + async def enqueue_discovered_snapshots_from_outputs(self, snapshot_payload: dict[str, Any]) -> None: + from archivebox.core.models import Snapshot + from archivebox.hooks import collect_urls_from_plugins + + if int(snapshot_payload["depth"]) >= self.crawl.max_depth: + return + if CrawlLimitState.from_config(snapshot_payload["config"]).get_stop_reason() == "max_size": + return + + discovered_urls = await sync_to_async(collect_urls_from_plugins, thread_sensitive=True)(Path(snapshot_payload["output_dir"])) + if not discovered_urls: + return + + parent_snapshot = snapshot_payload.get("_snapshot") + if parent_snapshot is None: + parent_snapshot = await sync_to_async( + lambda: Snapshot.objects.select_related("crawl", "crawl__created_by").filter(id=snapshot_payload["id"]).first(), + thread_sensitive=True, + )() + if parent_snapshot is None: + return + + for record in discovered_urls: + url = str(record.get("url") or "").strip() + if not url: + continue + passes_filters = await sync_to_async(self.crawl.url_passes_filters, thread_sensitive=True)(url, snapshot=parent_snapshot) + if not passes_filters: + continue + child_snapshot = await sync_to_async(Snapshot.from_json, 
thread_sensitive=True)( + { + "url": url, + "depth": parent_snapshot.depth + 1, + "title": str(record.get("title") or "").strip(), + "tags": str(record.get("tags") or "").strip(), + "parent_snapshot_id": str(parent_snapshot.id), + "crawl_id": str(self.crawl.id), + }, + overrides={ + "crawl": self.crawl, + "snapshot": parent_snapshot, + "created_by_id": self.crawl.created_by_id, + }, + queue_for_extraction=False, + ) + if child_snapshot is None or child_snapshot.status == child_snapshot.StatusChoices.SEALED: + continue + child_snapshot.status = child_snapshot.StatusChoices.QUEUED + child_snapshot.retry_at = timezone.now() + await child_snapshot.asave(update_fields=["status", "retry_at", "modified_at"]) + if self.process_discovered_snapshots_inline: + await self.enqueue_snapshot(str(child_snapshot.id)) + + async def run_crawl_setup(self, snapshot_id: str) -> None: + snapshot = await sync_to_async(self.load_snapshot_payload, thread_sensitive=True)(snapshot_id) + await download( + url=snapshot["url"], + plugins=self.plugins, + output_dir=Path(snapshot["output_dir"]), + selected_plugins=self.selected_plugins, + config_overrides=_normalize_runtime_config(snapshot["config"]), + derived_config_overrides=_normalize_runtime_config(self.derived_config), + bus=self.bus, + emit_jsonl=False, + install_enabled=True, + crawl_setup_enabled=True, + crawl_start_enabled=False, + snapshot_cleanup_enabled=False, + crawl_cleanup_enabled=False, + MachineService=None, + BinaryService=None, + ProcessService=None, + ArchiveResultService=None, + TagService=None, + ) + + async def run_crawl_cleanup(self, snapshot_id: str) -> None: + snapshot = await sync_to_async(self.load_snapshot_payload, thread_sensitive=True)(snapshot_id) + await download( + bus=self.bus, + url=snapshot["url"], + output_dir=Path(snapshot["output_dir"]), + plugins=self.plugins, + selected_plugins=self.selected_plugins, + config_overrides=_normalize_runtime_config(snapshot["config"]), + 
derived_config_overrides=_normalize_runtime_config(self.derived_config), + emit_jsonl=False, + install_enabled=False, + crawl_setup_enabled=False, + crawl_start_enabled=False, + snapshot_cleanup_enabled=False, + crawl_cleanup_enabled=True, + MachineService=None, + BinaryService=None, + ProcessService=None, + ArchiveResultService=None, + TagService=None, + ) + + async def run_snapshot(self, snapshot_id: str) -> None: + async with self.snapshot_semaphore: + snapshot = await sync_to_async(self.load_snapshot_payload, thread_sensitive=True)(snapshot_id) + if snapshot["status"] == "sealed": + return + if snapshot["depth"] > 0 and CrawlLimitState.from_config(snapshot["config"]).get_stop_reason() == "max_size": + await sync_to_async(self.seal_snapshot_due_to_limit, thread_sensitive=True)(snapshot_id) + return + try: + await download( + url=snapshot["url"], + plugins=self.plugins, + output_dir=Path(snapshot["output_dir"]), + selected_plugins=self.selected_plugins, + config_overrides=_normalize_runtime_config(snapshot["config"]), + derived_config_overrides=_normalize_runtime_config(self.derived_config), + bus=self.bus, + emit_jsonl=False, + install_enabled=False, + crawl_setup_enabled=False, + crawl_start_enabled=True, + snapshot_cleanup_enabled=True, + crawl_cleanup_enabled=False, + MachineService=None, + BinaryService=None, + ProcessService=None, + ArchiveResultService=None, + TagService=None, + ) + await self.enqueue_discovered_snapshots_from_outputs(snapshot) + finally: + current_task = asyncio.current_task() + if current_task is not None and self.snapshot_tasks.get(snapshot_id) is current_task: + self.snapshot_tasks.pop(snapshot_id, None) + + def seal_snapshot_due_to_limit(self, snapshot_id: str) -> None: + from archivebox.core.models import Snapshot + + snapshot = Snapshot.objects.filter(id=snapshot_id).first() + if snapshot is None or snapshot.status == Snapshot.StatusChoices.SEALED: + return + snapshot.status = Snapshot.StatusChoices.SEALED + snapshot.retry_at = None 
async def _run_binary(binary_id: str) -> None:
    """Resolve or install a single Binary by emitting a BinaryRequestEvent.

    Creates a dedicated bus, attaches the ArchiveBox persistence services and
    the abx-dl services (all crawl/snapshot phases disabled, ``auto_install``
    on), emits the current machine config, then emits the request for the
    binary. The bus is stopped on every exit path.
    """
    from archivebox.config.configset import get_config
    from archivebox.machine.models import Binary, Machine

    binary = await Binary.objects.aget(id=binary_id)
    plugins = discover_plugins()
    config = get_config()
    machine = await sync_to_async(Machine.current, thread_sensitive=True)()
    derived_config = _normalize_runtime_config(dict(machine.config))
    config["ABX_RUNTIME"] = "archivebox"
    config = _normalize_runtime_config(config)
    bus = create_bus(name=_bus_name("ArchiveBox_binary", str(binary.id)), total_timeout=1800.0)

    # Everything after bus creation sits inside the try, so a failure while
    # wiring services or emitting config still stops the bus.
    try:
        ProcessService(bus)
        BinaryService(bus)
        TagService(bus)
        ArchiveResultService(bus)
        MachineService(bus)
        setup_abx_services(
            bus,
            plugins=plugins,
            install_enabled=False,
            crawl_setup_enabled=False,
            crawl_start_enabled=False,
            snapshot_cleanup_enabled=False,
            crawl_cleanup_enabled=False,
            persist_derived=False,
            auto_install=True,
            emit_jsonl=False,
        )
        await _emit_machine_config(bus, config=config, derived_config=derived_config)
        await bus.emit(
            BinaryRequestEvent(
                name=binary.name,
                plugin_name="archivebox",
                hook_name="on_BinaryRequest__archivebox_run",
                output_dir=str(binary.output_dir),
                binary_id=str(binary.id),
                machine_id=str(binary.machine_id),
                binproviders=binary.binproviders,
                overrides=binary.overrides or None,
            ),
        )
    finally:
        await bus.stop()
asyncio.run(_run_binary(binary_id)) + + +async def _run_install(plugin_names: list[str] | None = None) -> None: + from archivebox.config.configset import get_config + from archivebox.machine.models import Machine + + plugins = discover_plugins() + config = get_config() + machine = await sync_to_async(Machine.current, thread_sensitive=True)() + derived_config = _normalize_runtime_config(dict(machine.config)) + config["ABX_RUNTIME"] = "archivebox" + config = _normalize_runtime_config(config) + bus = create_bus(name="ArchiveBox_install", total_timeout=3600.0) + ProcessService(bus) + BinaryService(bus) + TagService(bus) + ArchiveResultService(bus) + MachineService(bus) + await _emit_machine_config(bus, config=config, derived_config=derived_config) + live_stream = None + + try: + selected_plugins = filter_plugins(plugins, list(plugin_names), include_providers=True) if plugin_names else plugins + if not selected_plugins: + return + plugins_label = ", ".join(plugin_names) if plugin_names else f"all ({len(plugins)} available)" + timeout_seconds = config["TIMEOUT"] + stdout_is_tty = sys.stdout.isatty() + stderr_is_tty = sys.stderr.isatty() + interactive_tty = stdout_is_tty or stderr_is_tty + ui_console = None + live_ui = None + + if interactive_tty: + stream = sys.stderr if stderr_is_tty else sys.stdout + if os.path.exists("/dev/tty"): + try: + live_stream = open("/dev/tty", "w", buffering=1, encoding=stream.encoding or "utf-8") + stream = live_stream + except OSError: + live_stream = None + try: + terminal_size = os.get_terminal_size(stream.fileno()) + terminal_width = terminal_size.columns + terminal_height = terminal_size.lines + except (AttributeError, OSError, ValueError): + terminal_size = shutil.get_terminal_size(fallback=(160, 40)) + terminal_width = terminal_size.columns + terminal_height = terminal_size.lines + ui_console = Console( + file=stream, + force_terminal=True, + width=terminal_width, + height=terminal_height, + _environ={ + "COLUMNS": 
str(terminal_width), + "LINES": str(terminal_height), + }, + ) + + with TemporaryDirectory(prefix="archivebox-install-") as temp_dir: + output_dir = Path(temp_dir) + if ui_console is not None: + live_ui = LiveBusUI( + bus, + total_hooks=_count_selected_hooks(selected_plugins, None), + timeout_seconds=timeout_seconds, + ui_console=ui_console, + interactive_tty=interactive_tty, + ) + live_ui.print_intro( + url="install", + output_dir=output_dir, + plugins_label=plugins_label, + ) + with live_ui if live_ui is not None else nullcontext(): + await abx_install_plugins( + plugin_names=plugin_names, + plugins=plugins, + output_dir=output_dir, + config_overrides=config, + derived_config_overrides=derived_config, + emit_jsonl=False, + bus=bus, + MachineService=None, + ) + if live_ui is not None: + live_ui.print_summary(output_dir=output_dir) + finally: + await bus.stop() + try: + if live_stream is not None: + live_stream.close() + except Exception: + pass + + +def run_install(*, plugin_names: list[str] | None = None) -> None: + asyncio.run(_run_install(plugin_names=plugin_names)) + + +def recover_orphaned_crawls() -> int: + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + from archivebox.machine.models import Process + + active_crawl_ids: set[str] = set() + orphaned_crawls = list( + Crawl.objects.filter( + status=Crawl.StatusChoices.STARTED, + retry_at__isnull=True, + ).prefetch_related("snapshot_set"), + ) + running_processes = Process.objects.filter( + status=Process.StatusChoices.RUNNING, + process_type__in=[ + Process.TypeChoices.WORKER, + Process.TypeChoices.HOOK, + Process.TypeChoices.BINARY, + ], + ).only("pwd") + + for proc in running_processes: + if not proc.pwd: + continue + proc_pwd = Path(proc.pwd) + for crawl in orphaned_crawls: + matched_snapshot = None + for snapshot in crawl.snapshot_set.all(): + try: + proc_pwd.relative_to(snapshot.output_dir) + matched_snapshot = snapshot + break + except ValueError: + continue + if 
matched_snapshot is not None: + active_crawl_ids.add(str(crawl.id)) + break + + recovered = 0 + now = timezone.now() + for crawl in orphaned_crawls: + if str(crawl.id) in active_crawl_ids: + continue + + snapshots = list(crawl.snapshot_set.all()) + if not snapshots or all(snapshot.status == Snapshot.StatusChoices.SEALED for snapshot in snapshots): + crawl.status = Crawl.StatusChoices.SEALED + crawl.retry_at = None + crawl.save(update_fields=["status", "retry_at", "modified_at"]) + recovered += 1 + continue + + crawl.retry_at = now + crawl.save(update_fields=["retry_at", "modified_at"]) + recovered += 1 + + return recovered + + +def recover_orphaned_snapshots() -> int: + from archivebox.crawls.models import Crawl + from archivebox.core.models import ArchiveResult, Snapshot + from archivebox.machine.models import Process + + active_snapshot_ids: set[str] = set() + orphaned_snapshots = list( + Snapshot.objects.filter(status=Snapshot.StatusChoices.STARTED, retry_at__isnull=True) + .select_related("crawl") + .prefetch_related("archiveresult_set"), + ) + running_processes = Process.objects.filter( + status=Process.StatusChoices.RUNNING, + process_type__in=[ + Process.TypeChoices.WORKER, + Process.TypeChoices.HOOK, + Process.TypeChoices.BINARY, + ], + ).only("pwd") + + for proc in running_processes: + if not proc.pwd: + continue + proc_pwd = Path(proc.pwd) + for snapshot in orphaned_snapshots: + try: + proc_pwd.relative_to(snapshot.output_dir) + active_snapshot_ids.add(str(snapshot.id)) + break + except ValueError: + continue + + recovered = 0 + now = timezone.now() + for snapshot in orphaned_snapshots: + if str(snapshot.id) in active_snapshot_ids: + continue + + results = list(snapshot.archiveresult_set.all()) + if results and all(result.status in ArchiveResult.FINAL_STATES for result in results): + snapshot.status = Snapshot.StatusChoices.SEALED + snapshot.retry_at = None + snapshot.downloaded_at = snapshot.downloaded_at or now + snapshot.save(update_fields=["status", 
"retry_at", "downloaded_at", "modified_at"]) + + crawl = snapshot.crawl + if crawl.is_finished() and crawl.status != Crawl.StatusChoices.SEALED: + crawl.status = Crawl.StatusChoices.SEALED + crawl.retry_at = None + crawl.save(update_fields=["status", "retry_at", "modified_at"]) + recovered += 1 + continue + + snapshot.status = Snapshot.StatusChoices.QUEUED + snapshot.retry_at = now + snapshot.save(update_fields=["status", "retry_at", "modified_at"]) + + crawl = snapshot.crawl + crawl.status = Crawl.StatusChoices.QUEUED + crawl.retry_at = now + crawl.save(update_fields=["status", "retry_at", "modified_at"]) + recovered += 1 + + return recovered + + +def run_pending_crawls(*, daemon: bool = False, crawl_id: str | None = None) -> int: + from archivebox.crawls.models import Crawl, CrawlSchedule + from archivebox.core.models import Snapshot + from archivebox.machine.models import Binary + + while True: + if daemon and crawl_id is None: + now = timezone.now() + for schedule in CrawlSchedule.objects.filter(is_enabled=True).select_related("template", "template__created_by"): + if schedule.is_due(now): + schedule.enqueue(queued_at=now) + + queued_crawls = Crawl.objects.filter( + retry_at__lte=timezone.now(), + status=Crawl.StatusChoices.QUEUED, + ) + if crawl_id: + queued_crawls = queued_crawls.filter(id=crawl_id) + queued_crawls = queued_crawls.order_by("retry_at", "created_at") + + queued_crawl = queued_crawls.first() + if queued_crawl is not None: + if not queued_crawl.claim_processing_lock(lock_seconds=60): + continue + run_crawl(str(queued_crawl.id), process_discovered_snapshots_inline=False) + continue + + if crawl_id is None: + snapshot = ( + Snapshot.objects.filter(retry_at__lte=timezone.now()) + .exclude(status=Snapshot.StatusChoices.SEALED) + .select_related("crawl") + .order_by("retry_at", "created_at") + .first() + ) + if snapshot is not None: + if not snapshot.claim_processing_lock(lock_seconds=60): + continue + run_crawl( + str(snapshot.crawl_id), + 
snapshot_ids=[str(snapshot.id)], + process_discovered_snapshots_inline=False, + ) + continue + + if crawl_id is None: + # Standalone binary backlog should not starve queued crawls or snapshots. + # Crawl.run() already claims and installs crawl-declared Binary rows as needed. + binary = ( + Binary.objects.filter(retry_at__lte=timezone.now()) + .exclude(status=Binary.StatusChoices.INSTALLED) + .order_by("retry_at", "created_at") + .first() + ) + if binary is not None: + if not binary.claim_processing_lock(lock_seconds=60): + continue + run_binary(str(binary.id)) + continue + + pending = Crawl.objects.filter( + retry_at__lte=timezone.now(), + status=Crawl.StatusChoices.STARTED, + ) + if crawl_id: + pending = pending.filter(id=crawl_id) + pending = pending.order_by("retry_at", "created_at") + + crawl = pending.first() + if crawl is None: + if daemon: + time.sleep(2.0) + continue + return 0 + + if not crawl.claim_processing_lock(lock_seconds=60): + continue + + run_crawl(str(crawl.id), process_discovered_snapshots_inline=False) diff --git a/archivebox/services/snapshot_service.py b/archivebox/services/snapshot_service.py new file mode 100644 index 0000000000..a82b2d744a --- /dev/null +++ b/archivebox/services/snapshot_service.py @@ -0,0 +1,152 @@ +from __future__ import annotations + +from pathlib import Path + +from asgiref.sync import sync_to_async +from django.utils import timezone + +from abx_dl.events import SnapshotCompletedEvent, SnapshotEvent +from abx_dl.limits import CrawlLimitState +from abx_dl.services.base import BaseService + + +class SnapshotService(BaseService): + LISTENS_TO = [SnapshotEvent, SnapshotCompletedEvent] + EMITS = [] + + def __init__(self, bus, *, crawl_id: str, schedule_snapshot): + self.crawl_id = crawl_id + self.schedule_snapshot = schedule_snapshot + super().__init__(bus) + self.bus.on(SnapshotEvent, self.on_SnapshotEvent) + self.bus.on(SnapshotCompletedEvent, self.on_SnapshotCompletedEvent) + + async def _upsert_discovered_snapshot(self, 
parent_snapshot, *, url: str, depth: int, title: str = "", tags: str = "") -> str | None: + from archivebox.core.models import Snapshot + + crawl = parent_snapshot.crawl + if depth > crawl.max_depth: + return None + stop_reason = await sync_to_async(self._crawl_limit_stop_reason, thread_sensitive=True)(crawl) + if stop_reason == "max_size": + return None + passes_filters = await sync_to_async(crawl.url_passes_filters, thread_sensitive=True)(url, snapshot=parent_snapshot) + if not passes_filters: + return None + + snapshot = await sync_to_async(Snapshot.from_json, thread_sensitive=True)( + { + "url": url, + "depth": depth, + "title": title, + "tags": tags, + "parent_snapshot_id": str(parent_snapshot.id), + "crawl_id": str(crawl.id), + }, + overrides={ + "crawl": crawl, + "snapshot": parent_snapshot, + "created_by_id": crawl.created_by_id, + }, + queue_for_extraction=False, + ) + if snapshot is None or snapshot.status == Snapshot.StatusChoices.SEALED: + return None + + snapshot.status = Snapshot.StatusChoices.QUEUED + snapshot.retry_at = timezone.now() + await snapshot.asave(update_fields=["status", "retry_at", "modified_at"]) + return str(snapshot.id) + + async def on_SnapshotEvent(self, event: SnapshotEvent) -> None: + from archivebox.core.models import Snapshot + from archivebox.crawls.models import Crawl + + crawl = await Crawl.objects.aget(id=self.crawl_id) + snapshot_id: str | None = None + snapshot = await Snapshot.objects.filter(id=event.snapshot_id, crawl=crawl).afirst() + + if snapshot is not None: + snapshot.status = Snapshot.StatusChoices.STARTED + snapshot.retry_at = None + await snapshot.asave(update_fields=["status", "retry_at", "modified_at"]) + snapshot_id = str(snapshot.id) + elif event.depth > 0: + parent_event = await self.bus.find( + SnapshotEvent, + past=True, + future=False, + where=lambda candidate: candidate.depth == event.depth - 1 and self.bus.event_is_child_of(event, candidate), + ) + parent_snapshot = None + if parent_event is not None: + 
parent_snapshot = ( + await Snapshot.objects.select_related("crawl", "crawl__created_by") + .filter(id=parent_event.snapshot_id, crawl=crawl) + .afirst() + ) + if parent_snapshot is not None: + snapshot_id = await self._upsert_discovered_snapshot( + parent_snapshot, + url=event.url, + depth=event.depth, + ) + + if snapshot_id: + snapshot = await Snapshot.objects.filter(id=snapshot_id).select_related("crawl", "crawl__created_by").afirst() + if snapshot is not None: + await sync_to_async(snapshot.ensure_crawl_symlink, thread_sensitive=True)() + if snapshot_id and event.depth > 0: + await self.schedule_snapshot(snapshot_id) + + async def on_SnapshotCompletedEvent(self, event: SnapshotCompletedEvent) -> None: + from archivebox.core.models import Snapshot + + snapshot = await Snapshot.objects.select_related("crawl", "crawl__created_by").filter(id=event.snapshot_id).afirst() + snapshot_id: str | None = None + if snapshot is not None: + snapshot.status = Snapshot.StatusChoices.SEALED + snapshot.retry_at = None + snapshot.downloaded_at = snapshot.downloaded_at or timezone.now() + await snapshot.asave(update_fields=["status", "retry_at", "downloaded_at", "modified_at"]) + stop_reason = await sync_to_async(self._crawl_limit_stop_reason, thread_sensitive=True)(snapshot.crawl) + if snapshot.crawl_id and stop_reason == "max_size": + await ( + Snapshot.objects.filter( + crawl_id=snapshot.crawl_id, + status=Snapshot.StatusChoices.QUEUED, + ) + .exclude(id=snapshot.id) + .aupdate( + status=Snapshot.StatusChoices.SEALED, + retry_at=None, + modified_at=timezone.now(), + ) + ) + snapshot_id = str(snapshot.id) + if snapshot_id: + snapshot = await Snapshot.objects.filter(id=snapshot_id).select_related("crawl", "crawl__created_by").afirst() + if snapshot is not None: + await sync_to_async(snapshot.write_index_jsonl, thread_sensitive=True)() + await sync_to_async(snapshot.write_json_details, thread_sensitive=True)() + await sync_to_async(snapshot.write_html_details, 
thread_sensitive=True)() + stop_reason = await sync_to_async(self._crawl_limit_stop_reason, thread_sensitive=True)(snapshot.crawl) + if snapshot.depth < snapshot.crawl.max_depth and stop_reason != "max_size": + from archivebox.hooks import collect_urls_from_plugins + + discovered_urls = await sync_to_async(collect_urls_from_plugins, thread_sensitive=True)(Path(snapshot.output_dir)) + for record in discovered_urls: + discovered_snapshot_id = await self._upsert_discovered_snapshot( + snapshot, + url=str(record.get("url") or "").strip(), + depth=snapshot.depth + 1, + title=str(record.get("title") or "").strip(), + tags=str(record.get("tags") or "").strip(), + ) + if discovered_snapshot_id: + await self.schedule_snapshot(discovered_snapshot_id) + + def _crawl_limit_stop_reason(self, crawl) -> str: + config = dict(crawl.config or {}) + config["CRAWL_DIR"] = str(crawl.output_dir) + return CrawlLimitState.from_config(config).get_stop_reason() diff --git a/archivebox/services/tag_service.py b/archivebox/services/tag_service.py new file mode 100644 index 0000000000..22d6685dc7 --- /dev/null +++ b/archivebox/services/tag_service.py @@ -0,0 +1,22 @@ +from __future__ import annotations + +from abx_dl.events import TagEvent +from abx_dl.services.base import BaseService + + +class TagService(BaseService): + LISTENS_TO = [TagEvent] + EMITS = [] + + def __init__(self, bus): + super().__init__(bus) + self.bus.on(TagEvent, self.on_TagEvent__save_to_db) + + async def on_TagEvent__save_to_db(self, event: TagEvent) -> None: + from archivebox.core.models import Snapshot, SnapshotTag, Tag + + snapshot = await Snapshot.objects.filter(id=event.snapshot_id).afirst() + if snapshot is None: + return + tag, _ = await Tag.objects.aget_or_create(name=event.name) + await SnapshotTag.objects.aget_or_create(snapshot=snapshot, tag=tag) diff --git a/archivebox/static b/archivebox/static new file mode 120000 index 0000000000..5d01044d31 --- /dev/null +++ b/archivebox/static @@ -0,0 +1 @@ 
+templates/static \ No newline at end of file diff --git a/archivebox/system.py b/archivebox/system.py deleted file mode 100644 index 2dd12297e3..0000000000 --- a/archivebox/system.py +++ /dev/null @@ -1,206 +0,0 @@ -__package__ = 'archivebox' - - -import os -import signal -import shutil - -from json import dump -from pathlib import Path -from typing import Optional, Union, Set, Tuple -from subprocess import _mswindows, PIPE, Popen, CalledProcessError, CompletedProcess, TimeoutExpired - -from crontab import CronTab -from .vendor.atomicwrites import atomic_write as lib_atomic_write - -from .util import enforce_types, ExtendedEncoder -from .config import OUTPUT_PERMISSIONS - - - -def run(*args, input=None, capture_output=True, timeout=None, check=False, text=False, start_new_session=True, **kwargs): - """Patched of subprocess.run to kill forked child subprocesses and fix blocking io making timeout=innefective - Mostly copied from https://github.com/python/cpython/blob/master/Lib/subprocess.py - """ - - if input is not None: - if kwargs.get('stdin') is not None: - raise ValueError('stdin and input arguments may not both be used.') - kwargs['stdin'] = PIPE - - if capture_output: - if ('stdout' in kwargs) or ('stderr' in kwargs): - raise ValueError('stdout and stderr arguments may not be used ' - 'with capture_output.') - kwargs['stdout'] = PIPE - kwargs['stderr'] = PIPE - - pgid = None - try: - with Popen(*args, start_new_session=start_new_session, **kwargs) as process: - pgid = os.getpgid(process.pid) - try: - stdout, stderr = process.communicate(input, timeout=timeout) - except TimeoutExpired as exc: - process.kill() - if _mswindows: - # Windows accumulates the output in a single blocking - # read() call run on child threads, with the timeout - # being done in a join() on those threads. communicate() - # _after_ kill() is required to collect that and add it - # to the exception. 
- exc.stdout, exc.stderr = process.communicate() - else: - # POSIX _communicate already populated the output so - # far into the TimeoutExpired exception. - process.wait() - raise - except: # Including KeyboardInterrupt, communicate handled that. - process.kill() - # We don't call process.wait() as .__exit__ does that for us. - raise - - retcode = process.poll() - if check and retcode: - raise CalledProcessError(retcode, process.args, - output=stdout, stderr=stderr) - finally: - # force kill any straggler subprocesses that were forked from the main proc - try: - os.killpg(pgid, signal.SIGINT) - except Exception: - pass - - return CompletedProcess(process.args, retcode, stdout, stderr) - - -@enforce_types -def atomic_write(path: Union[Path, str], contents: Union[dict, str, bytes], overwrite: bool=True) -> None: - """Safe atomic write to filesystem by writing to temp file + atomic rename""" - - mode = 'wb+' if isinstance(contents, bytes) else 'w' - encoding = None if isinstance(contents, bytes) else 'utf-8' # enforce utf-8 on all text writes - - # print('\n> Atomic Write:', mode, path, len(contents), f'overwrite={overwrite}') - try: - with lib_atomic_write(path, mode=mode, overwrite=overwrite, encoding=encoding) as f: - if isinstance(contents, dict): - dump(contents, f, indent=4, sort_keys=True, cls=ExtendedEncoder) - elif isinstance(contents, (bytes, str)): - f.write(contents) - except OSError as e: - print(f"[X] OSError: Failed to write {path} with fcntl.F_FULLFSYNC. 
({e})") - print(" You can store the archive/ subfolder on a hard drive or network share that doesn't support support syncronous writes,") - print(" but the main folder containing the index.sqlite3 and ArchiveBox.conf files must be on a filesystem that supports FSYNC.") - raise SystemExit(1) - os.chmod(path, int(OUTPUT_PERMISSIONS, base=8)) - -@enforce_types -def chmod_file(path: str, cwd: str='.', permissions: str=OUTPUT_PERMISSIONS) -> None: - """chmod -R /""" - - root = Path(cwd) / path - if not root.exists(): - raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path)) - - if not root.is_dir(): - os.chmod(root, int(OUTPUT_PERMISSIONS, base=8)) - else: - for subpath in Path(path).glob('**/*'): - os.chmod(subpath, int(OUTPUT_PERMISSIONS, base=8)) - - -@enforce_types -def copy_and_overwrite(from_path: Union[str, Path], to_path: Union[str, Path]): - """copy a given file or directory to a given path, overwriting the destination""" - if Path(from_path).is_dir(): - shutil.rmtree(to_path, ignore_errors=True) - shutil.copytree(from_path, to_path) - else: - with open(from_path, 'rb') as src: - contents = src.read() - atomic_write(to_path, contents) - - -@enforce_types -def get_dir_size(path: Union[str, Path], recursive: bool=True, pattern: Optional[str]=None) -> Tuple[int, int, int]: - """get the total disk size of a given directory, optionally summing up - recursively and limiting to a given filter list - """ - num_bytes, num_dirs, num_files = 0, 0, 0 - for entry in os.scandir(path): - if (pattern is not None) and (pattern not in entry.path): - continue - if entry.is_dir(follow_symlinks=False): - if not recursive: - continue - num_dirs += 1 - bytes_inside, dirs_inside, files_inside = get_dir_size(entry.path) - num_bytes += bytes_inside - num_dirs += dirs_inside - num_files += files_inside - else: - num_bytes += entry.stat(follow_symlinks=False).st_size - num_files += 1 - return num_bytes, num_dirs, num_files - - -CRON_COMMENT = 
'archivebox_schedule' - - -@enforce_types -def dedupe_cron_jobs(cron: CronTab) -> CronTab: - deduped: Set[Tuple[str, str]] = set() - - for job in list(cron): - unique_tuple = (str(job.slices), job.command) - if unique_tuple not in deduped: - deduped.add(unique_tuple) - cron.remove(job) - - for schedule, command in deduped: - job = cron.new(command=command, comment=CRON_COMMENT) - job.setall(schedule) - job.enable() - - return cron - - -class suppress_output(object): - ''' - A context manager for doing a "deep suppression" of stdout and stderr in - Python, i.e. will suppress all print, even if the print originates in a - compiled C/Fortran sub-function. - This will not suppress raised exceptions, since exceptions are printed - to stderr just before a script exits, and after the context manager has - exited (at least, I think that is why it lets exceptions through). - - with suppress_stdout_stderr(): - rogue_function() - ''' - def __init__(self, stdout=True, stderr=True): - # Open a pair of null files - # Save the actual stdout (1) and stderr (2) file descriptors. - self.stdout, self.stderr = stdout, stderr - if stdout: - self.null_stdout = os.open(os.devnull, os.O_RDWR) - self.real_stdout = os.dup(1) - if stderr: - self.null_stderr = os.open(os.devnull, os.O_RDWR) - self.real_stderr = os.dup(2) - - def __enter__(self): - # Assign the null pointers to stdout and stderr. - if self.stdout: - os.dup2(self.null_stdout, 1) - if self.stderr: - os.dup2(self.null_stderr, 2) - - def __exit__(self, *_): - # Re-assign the real stdout/stderr back to (1) and (2) - if self.stdout: - os.dup2(self.real_stdout, 1) - os.close(self.null_stdout) - if self.stderr: - os.dup2(self.real_stderr, 2) - os.close(self.null_stderr) diff --git a/archivebox/templates/admin/actions.html b/archivebox/templates/admin/actions.html new file mode 100644 index 0000000000..66dc93aea0 --- /dev/null +++ b/archivebox/templates/admin/actions.html @@ -0,0 +1,65 @@ +{% load i18n %} +
    +
    + {% block actions %} + {% block actions-form %} + {% for field in action_form %} + {% if field.name == "tags" %} + {{ field }} + {% else %} + {% if field.label %}{% else %}{{ field }}{% endif %} + {% endif %} + {% endfor %} + {% endblock %} + {% block actions-submit %} + + {% endblock %} + {% block actions-counter %} + {% if actions_selection_counter %} + {{ selection_note }} + {% if cl.result_count != cl.result_list|length %} + + + + {% endif %} + {% endif %} + {% endblock %} + {% endblock %} +
    + {% if action_index|default:0 == 0 %} + {% if cl.has_filters or opts.model_name == 'snapshot' %} +
    + {% if cl.has_filters %} + + {% endif %} + {% if request.resolver_match.url_name == 'grid' %} + + {% elif opts.model_name == 'snapshot' %} + + {% endif %} +
    + {% endif %} + {% endif %} +
    diff --git a/archivebox/templates/admin/base.html b/archivebox/templates/admin/base.html index 9dc625166e..1603e92625 100644 --- a/archivebox/templates/admin/base.html +++ b/archivebox/templates/admin/base.html @@ -1,4 +1,4 @@ -{% load i18n static tz %} +{% load i18n static tz core_tags %} {% get_current_language as LANGUAGE_CODE %} {% get_current_language_bidi as LANGUAGE_BIDI %} @@ -12,7 +12,1479 @@ {% endblock %} - {% block extrastyle %}{% endblock %} + {% api_token as api_token %} + + {% block extrastyle %} + + {% endblock %} {% if LANGUAGE_BIDI %} @@ -26,6 +1498,13 @@ {% endif %} {% endblock %} + + + + + {% endif %} + {% endcomment %} + + diff --git a/archivebox/templates/admin/change_list.html b/archivebox/templates/admin/change_list.html new file mode 100644 index 0000000000..a1c8eafce1 --- /dev/null +++ b/archivebox/templates/admin/change_list.html @@ -0,0 +1,175 @@ +{% extends "admin/base_site.html" %} +{% load i18n admin_urls static admin_list %} + +{% block title %}{% if cl.formset and cl.formset.errors %}{% translate "Error:" %} {% endif %}{{ block.super }}{% endblock %} +{% block extrastyle %} + {{ block.super }} + + {% if cl.formset %} + + {% endif %} + {% if cl.formset or action_form %} + + {% endif %} + {{ media.css }} + +{% endblock %} + +{% block extrahead %} +{{ block.super }} +{{ media.js }} + +{% endblock %} + +{% block bodyclass %}{{ block.super }} app-{{ opts.app_label }} model-{{ opts.model_name }} change-list{% endblock %} + +{% if not is_popup %} +{% block breadcrumbs %} + +{% endblock %} +{% endif %} + +{% block coltype %}{% endblock %} + +{% block content %} +
    + {% block object-tools %} +
      + {% block object-tools-items %} + {% change_list_object_tools %} + {% endblock %} +
    + {% endblock %} + {% if cl.formset and cl.formset.errors %} +

    + {% blocktranslate count counter=cl.formset.total_error_count %}Please correct the error below.{% plural %}Please correct the errors below.{% endblocktranslate %} +

    + {{ cl.formset.non_form_errors }} + {% endif %} + {% if cl.model_admin.show_search_mode_selector %} + {% with current_search_mode=cl.params.search_mode|default:cl.model_admin.get_default_search_mode %} +
    + {% endwith %} + {% else %} +
    + {% endif %} +
    +
    + {% block search %}{% search_form cl %}{% endblock %} + {% block date_hierarchy %}{% if cl.date_hierarchy %}{% date_hierarchy cl %}{% endif %}{% endblock %} + +
    {% csrf_token %} + {% if cl.formset %} +
    {{ cl.formset.management_form }}
    + {% endif %} + + {% block result_list %} + {% if action_form and actions_on_top and cl.show_admin_actions %}{% admin_actions %}{% endif %} + {% result_list cl %} + {% if action_form and actions_on_bottom and cl.show_admin_actions %}{% admin_actions %}{% endif %} + {% endblock %} + {% block pagination %} + + {% endblock %} +
    +
    +
    + {% block filters %} + {% if cl.has_filters %} +
    +

    + {% translate 'Filter' %} + +

    + {% if cl.is_facets_optional or cl.has_active_filters %}
    + {% if cl.is_facets_optional %}

    + {% if cl.add_facets %}{% translate "Hide counts" %} + {% else %}{% translate "Show counts" %}{% endif %} +

    {% endif %} + {% if cl.has_active_filters %}

    + ✖ {% translate "Clear all filters" %} +

    {% endif %} +
    {% endif %} + {% for spec in cl.filter_specs %}{% admin_list_filter cl spec %}{% endfor %} +
    + {% endif %} + {% endblock %} +
    +
    + {% if cl.has_filters %} + + {% endif %} +{% endblock %} diff --git a/archivebox/templates/admin/change_list_results.html b/archivebox/templates/admin/change_list_results.html new file mode 100644 index 0000000000..71f410e382 --- /dev/null +++ b/archivebox/templates/admin/change_list_results.html @@ -0,0 +1,38 @@ +{% load i18n core_tags %} +{% if result_hidden_fields %} +
    {# DIV for HTML validation #} +{% for item in result_hidden_fields %}{{ item }}{% endfor %} +
    +{% endif %} +{% if results %} +
    + + + +{% for header in result_headers %} +{% endfor %} + + + +{% for result in results %} +{% if result.form and result.form.non_field_errors %} + +{% endif %} +{% with row_obj=cl.result_list|index:forloop.counter0 %} +{% for item in result %}{{ item }}{% endfor %} +{% endwith %} +{% endfor %} + +
    + {% if header.sortable and header.sort_priority > 0 %} +
    + + {% if num_sorted_fields > 1 %}{{ header.sort_priority }}{% endif %} + +
    + {% endif %} +
    {% if header.sortable %}{{ header.text|capfirst }}{% else %}{{ header.text|capfirst }}{% endif %}
    +
    +
    {{ result.form.non_field_errors }}
    +
    +{% endif %} diff --git a/archivebox/templates/admin/core/archiveresult/change_list.html b/archivebox/templates/admin/core/archiveresult/change_list.html new file mode 100644 index 0000000000..b44e92114b --- /dev/null +++ b/archivebox/templates/admin/core/archiveresult/change_list.html @@ -0,0 +1,142 @@ +{% extends "admin/base_site.html" %} +{% load i18n admin_urls static admin_list %} + +{% block title %}{% if cl.formset and cl.formset.errors %}{% translate "Error:" %} {% endif %}{{ block.super }}{% endblock %} +{% block extrastyle %} + {{ block.super }} + + {% if cl.formset %} + + {% endif %} + {% if cl.formset or action_form %} + + {% endif %} + {{ media.css }} + {% if not actions_on_top and not actions_on_bottom %} + + {% endif %} +{% endblock %} + +{% block extrahead %} +{{ block.super }} +{{ media.js }} +{% endblock %} + +{% block bodyclass %}{{ block.super }} app-{{ opts.app_label }} model-{{ opts.model_name }} change-list{% endblock %} + +{% if not is_popup %} +{% block breadcrumbs %} + +{% endblock %} +{% endif %} + +{% block coltype %}{% endblock %} + +{% block content %} +
    + {% block object-tools %} +
      + {% block object-tools-items %} + {% change_list_object_tools %} + {% endblock %} +
    + {% endblock %} + {% if cl.formset and cl.formset.errors %} +

    + {% if cl.formset.total_error_count == 1 %}{% translate "Please correct the error below." %}{% else %}{% translate "Please correct the errors below." %}{% endif %} +

    + {{ cl.formset.non_form_errors }} + {% endif %} +
    +
    +
    + {% block search %}{% search_form cl %}{% endblock %} + {% block date_hierarchy %}{% if cl.date_hierarchy %}{% date_hierarchy cl %}{% endif %}{% endblock %} + +
    {% csrf_token %} + {% if cl.formset %} +
    {{ cl.formset.management_form }}
    + {% endif %} + + {% block result_list %} + {% if action_form and actions_on_top and cl.show_admin_actions %}{% admin_actions %}{% endif %} + {% result_list cl %} + {% if action_form and actions_on_bottom and cl.show_admin_actions %}{% admin_actions %}{% endif %} + {% endblock %} + {% block pagination %} + + {% endblock %} +
    +
    +
    + {% if cl.has_filters %} +
    +

    + {% translate 'Filter' %} + +

    + {% if cl.has_active_filters %}

    + ✖ {% translate "Clear all filters" %} +

    {% endif %} + {% for spec in cl.filter_specs %}{% admin_list_filter cl spec %}{% endfor %} +
    + {% endif %} +
    +
    + {% if cl.has_filters %} + + {% endif %} +{% endblock %} diff --git a/archivebox/templates/admin/core/tag/change_form.html b/archivebox/templates/admin/core/tag/change_form.html new file mode 100644 index 0000000000..cde49905b1 --- /dev/null +++ b/archivebox/templates/admin/core/tag/change_form.html @@ -0,0 +1,268 @@ +{% extends "admin/change_form.html" %} + +{% block bodyclass %}{{ block.super }} app-core model-tag tag-form-page{% endblock %} + +{% block extrastyle %} +{{ block.super }} + +{% endblock %} + +{% block form_top %} +
    +
    +

    {% if add %}New Tag{% else %}Edit Tag{% endif %}

    +

    Similar tags are shown below while typing.

    +
    +
    +
    + Matches + Current tags +
    +
    + Links + Open filtered snapshots +
    +
    +
    +{{ block.super }} +{% endblock %} + +{% block after_field_sets %} +{{ block.super }} +
    +

    Similar Tags

    +

    Updates while typing.

    +
    +
    + +{{ tag_similar_cards|json_script:"abx-tag-similar-data" }} + + +{% endblock %} diff --git a/archivebox/templates/admin/core/tag/change_list.html b/archivebox/templates/admin/core/tag/change_list.html new file mode 100644 index 0000000000..183826d04e --- /dev/null +++ b/archivebox/templates/admin/core/tag/change_list.html @@ -0,0 +1,1009 @@ +{% extends "admin/change_list.html" %} + +{% block bodyclass %}{{ block.super }} app-core model-tag change-list tag-admin-page{% endblock %} + +{% block object-tools %}{% endblock %} + +{% block extrastyle %} +{{ block.super }} + +{% endblock %} + +{% block content %} +
    +
    +
    + + +
    +
    + + + +
    +
    + +
    +
    + {% csrf_token %} +
    + + +
    +
    +
    +
    + +
    +
    + {% if initial_tag_cards %} + {% for card in initial_tag_cards %} +
    +
    +
    + +
    + + + +
    +
    +
    + + + + + {{ card.num_snapshots }} +
    +
    +
    + {% if card.snapshots %} + {% for snapshot in card.snapshots %} + + + {{ snapshot.title }} + + {% endfor %} + {% else %} +
    No snapshots attached yet.
    + {% endif %} +
    +
    + {% endfor %} + {% else %} +
    No tags.
    + {% endif %} +
    +
    +
    + +{{ initial_tag_cards|json_script:"abx-tag-cards-data" }} + + +{% endblock %} diff --git a/archivebox/templates/admin/personas/persona/change_form.html b/archivebox/templates/admin/personas/persona/change_form.html new file mode 100644 index 0000000000..262c66c96d --- /dev/null +++ b/archivebox/templates/admin/personas/persona/change_form.html @@ -0,0 +1,249 @@ +{% extends "admin/change_form.html" %} + +{% block bodyclass %}{{ block.super }} app-personas model-persona{% endblock %} + +{% block extrastyle %} +{{ block.super }} + +{% endblock %} + +{% block extrahead %} +{{ block.super }} + +{% endblock %} + +{% block form_top %} +
    +
    +

    Bootstrap a persona from a real browser session

    +

    + Pick a local Chromium profile, paste an absolute profile path, or attach to a live CDP endpoint. + The form saves the Persona normally, then imports profile files, cookies, and optional tab storage into + the Persona's own directories. +

    +
    +
    +
    + Detected profiles + {{ detected_profile_count }} +
    +
    + Persona artifacts + chrome_user_data + cookies.txt + auth.json +
    +
    +
    +{{ block.super }} +{% endblock %} diff --git a/archivebox/templates/admin/private_index.html b/archivebox/templates/admin/private_index.html index 7afb62c343..7db75b3032 100644 --- a/archivebox/templates/admin/private_index.html +++ b/archivebox/templates/admin/private_index.html @@ -1,62 +1,3 @@ -{% extends "base.html" %} -{% load static %} - -{% block body %} -
    - -
    - - - - - - - - - - - {% for link in object_list %} - {% include 'main_index_row.html' with link=link %} - {% endfor %} - -
    BookmarkedSnapshot ({{object_list|length}})FilesOriginal URL
    -
    - - {% if page_obj.has_previous %} - « first - previous - {% endif %} - - - Page {{ page_obj.number }} of {{ page_obj.paginator.num_pages }}. - - - {% if page_obj.has_next %} - next - last » - {% endif %} - - - {% if page_obj.has_next %} - next - last » - {% endif %} - -
    -
    -{% endblock %} {% extends "admin/base_site.html" %} {% load i18n admin_urls static admin_list %} {% load core_tags %} @@ -137,7 +78,19 @@ {% block filters %} {% if cl.has_filters %}
    -

    {% translate 'Filter' %}

    +

    + {% translate 'Filter' %} + +

    {% if cl.has_active_filters %}

    ✖ {% translate "Clear all filters" %}

    {% endif %} @@ -147,4 +100,39 @@

    {% translate 'Filter' %}

    {% endblock %}
    + {% if cl.has_filters %} + + {% endif %} {% endblock %} diff --git a/archivebox/templates/admin/private_index_grid.html b/archivebox/templates/admin/private_index_grid.html index b60f3a3e79..7db75b3032 100644 --- a/archivebox/templates/admin/private_index_grid.html +++ b/archivebox/templates/admin/private_index_grid.html @@ -78,7 +78,19 @@ {% block filters %} {% if cl.has_filters %}
    -

    {% translate 'Filter' %}

    +

    + {% translate 'Filter' %} + +

    {% if cl.has_active_filters %}

    ✖ {% translate "Clear all filters" %}

    {% endif %} @@ -88,4 +100,39 @@

    {% translate 'Filter' %}

    {% endblock %}
    + {% if cl.has_filters %} + + {% endif %} {% endblock %} diff --git a/archivebox/templates/admin/progress_monitor.html b/archivebox/templates/admin/progress_monitor.html new file mode 100644 index 0000000000..9d8667422d --- /dev/null +++ b/archivebox/templates/admin/progress_monitor.html @@ -0,0 +1,1144 @@ + + +
    +
    +
    +
    + + Runner stopped + +
    +
    +
    + Processes + 0 +
    +
    + Queued + 0 +
    +
    + Done + 0 +
    +
    + Failed + 0 +
    +
    +
    +
    + +
    +
    + +
    +
    No active crawls
    +
    +
    +
    + + diff --git a/archivebox/templates/admin/search_form.html b/archivebox/templates/admin/search_form.html new file mode 100644 index 0000000000..e386041c00 --- /dev/null +++ b/archivebox/templates/admin/search_form.html @@ -0,0 +1,43 @@ +{% load i18n static %} +{% if cl.search_fields %} +
    +

    {% blocktranslate with name=cl.opts.verbose_name_plural %}Search {{ name }}{% endblocktranslate %}

    +
    +{% endif %} diff --git a/archivebox/templates/admin/snapshots_grid.html b/archivebox/templates/admin/snapshots_grid.html index d76e259737..16aa858ef9 100644 --- a/archivebox/templates/admin/snapshots_grid.html +++ b/archivebox/templates/admin/snapshots_grid.html @@ -126,6 +126,21 @@ .cards .card .card-info .timestamp { font-weight: 600; } + .cards .card .card-progress { + display: flex; + align-items: center; + gap: 6px; + padding: 4px 0; + } + .cards .card .card-progress .progress-text { + font-size: 11px; + color: #3b82f6; + font-weight: 500; + } + .cards .card.archiving { + border-color: #3b82f6; + box-shadow: 0 0 0 2px rgba(59, 130, 246, 0.2); + } .cards .card .card-footer code { display: inline-block; width: 100%; @@ -145,19 +160,32 @@ {% block content %}
    {% for obj in results %} -
    +
    - - {{obj.added}} + + {{obj.bookmarked_at}} + {% if obj.status == 'started' %} +
    + + Archiving... +
    + {% else %} +
    + {{ obj.icons|safe }} +
    + {% endif %}
    - - {{obj.title|default:'Not yet archived...'}} + + {{obj.title|default:'Not yet archived...'}} {% endif %}
    - +

    {% if obj.is_archived %} - + {% else %} {% endif %} diff --git a/archivebox/templates/core/add.html b/archivebox/templates/core/add.html index 978567a3ab..198bafdf24 100644 --- a/archivebox/templates/core/add.html +++ b/archivebox/templates/core/add.html @@ -28,30 +28,1193 @@

    Add new URLs to your archive: results

      Add more URLs ➕ {% else %} +
    {% csrf_token %} -

    Add new URLs to your archive

    -
    - {{ form.as_p }}
    - +

    Create a new Crawl

    +
    +
    +

    + A Crawl is a job that processes URLs and creates Snapshots (archived copies) for each URL discovered. + The settings below apply to the entire crawl and all snapshots it creates. +

    +
    + + +
    +
    +
    +
    +
    + {{ form.url.label_tag }} +
    0 URLs detected
    +
    +
    + + {{ form.url }} +
    +
    + +
    + {% if form.url.errors %} +
    {{ form.url.errors }}
    + {% endif %} +
    + Enter URLs to archive, as one per line, CSV, JSON, or embedded in text (e.g. markdown, HTML, etc.). Examples:
    + https://example.com
    + https://news.ycombinator.com,https://news.google.com
    + [ArchiveBox](https://github.com/ArchiveBox/ArchiveBox) +
    +
    + +
    + {{ form.tag.label_tag }} + {{ form.tag }} + {% if form.tag.errors %} +
    {{ form.tag.errors }}
    + {% endif %} +
    Tags will be applied to all snapshots created by this crawl.
    +
    + +
    +
    + {{ form.depth.label_tag }} + {{ form.depth }} + {% if form.depth.errors %} +
    {{ form.depth.errors }}
    + {% endif %} +
    Controls how many links deep the crawl will follow from the starting URLs.
    + +
    +
    + {{ form.max_urls.label_tag }} + {{ form.max_urls }} + {% if form.max_urls.errors %} +
    {{ form.max_urls.errors }}
    + {% endif %} +
    0 means unlimited. When set, only the first N filtered URLs will be snapshotted.
    +
    + +
    + {{ form.max_size.label_tag }} + {{ form.max_size }} + {% if form.max_size.errors %} +
    {{ form.max_size.errors }}
    + {% endif %} +
    0 means unlimited. Accepts bytes or units like 45mb and 1gb.
    +
    +
    +
    + +
    + {{ form.url_filters }} + {% if form.url_filters.errors %} +
    {{ form.url_filters.errors }}
    + {% endif %} +
    +
    + +
    + {{ form.notes.label_tag }} + {{ form.notes }} + {% if form.notes.errors %} +
    {{ form.notes.errors }}
    + {% endif %} +
    Optional description for this crawl (visible in the admin interface).
    +
    + +
    + {{ form.persona.label_tag }} + {{ form.persona }} + {% if form.persona.errors %} +
    {{ form.persona.errors }}
    + {% endif %} +
    + Authentication profile (Chrome profile, cookies, etc.) to use when accessing URLs. + Create new persona / import from Chrome → +
    +
    +
    + + +
    +

    Crawl Plugins

    +

    + Select which archiving methods to run for all snapshots in this crawl. If none selected, all available plugins will be used. + View plugin details → +

    + +
    + Quick Select: + + + + + +
    + +
    +
    +
    + + +
    +
    + {{ form.chrome_plugins }} +
    +
    + +
    +
    + +
    +
    + {{ form.archiving_plugins }} +
    +
    + +
    +
    + +
    +
    + {{ form.parsing_plugins }} +
    +
    + +
    +
    + + (defaults to SEARCH_BACKEND_ENGINE) +
    +
    + {{ form.search_plugins }} +
    +
    + +
    +
    + +
    +
    + {{ form.binary_plugins }} +
    +
    + +
    +
    + +
    +
    + {{ form.extension_plugins }} +
    +
    +
    +
    + + +
    +
    +

    Advanced Crawl Options

    +

    Additional settings that control how this crawl processes URLs and creates snapshots.

    + +
    + {{ form.schedule.label_tag }} + {{ form.schedule }} + {% if form.schedule.errors %} +
    {{ form.schedule.errors }}
    + {% endif %} +
    + Optional: Schedule this crawl to repeat automatically. Examples:
    + daily - Run once per day
    + weekly - Run once per week
    + 0 */6 * * * - Every 6 hours (cron format)
    + 0 0 * * 0 - Every Sunday at midnight (cron format) +
    +
    + +
    + {{ form.index_only }} + {{ form.index_only.label_tag }} + {% if form.index_only.errors %} +
    {{ form.index_only.errors }}
    + {% endif %} +
    Create the crawl and queue snapshots without running archive plugins yet.
    +
    + +
    + {{ form.config.label_tag }} + {{ form.config }} + {% if form.config.errors %} +
    {{ form.config.errors }}
    + {% endif %} +
    + Override any config option for this crawl (e.g., TIMEOUT, USER_AGENT, CHROME_BINARY, etc.). URL_ALLOWLIST, URL_DENYLIST, and ENABLED_PLUGINS are updated automatically from the fields above. +
    +
    +
    +
    + +
    +



    {% if absolute_add_path %} -
    + {% endif %} diff --git a/archivebox/templates/core/base.html b/archivebox/templates/core/base.html index d2268fd0c7..18ace420dd 100644 --- a/archivebox/templates/core/base.html +++ b/archivebox/templates/core/base.html @@ -1,4 +1,4 @@ -{% load static tz admin_urls %} +{% load static tz admin_urls core_tags %} @@ -9,6 +9,10 @@ + {% api_token as api_token %} + {% block extra_head %} @@ -38,8 +42,11 @@


    - Archive created using ArchiveBox version - v{{VERSION}}. + Archive created using ArchiveBox + v{{VERSION}} + {% if COMMIT_HASH %} + ({{COMMIT_HASH|truncatechars:9}}) + {% endif %}.

    {{FOOTER_INFO}}
    diff --git a/archivebox/templates/core/index_row.html b/archivebox/templates/core/index_row.html index 55c966aaa6..0b4aa265c7 100644 --- a/archivebox/templates/core/index_row.html +++ b/archivebox/templates/core/index_row.html @@ -1,17 +1,17 @@ {% load static tz core_tags %} - - {{ link.added|localtime }} + + {{ link.bookmarked_at|localtime }} {% if link.is_archived %} - + {% else %} - + {% endif %} - + {{link.title|default:'Loading...'|truncatechars:128}} @@ -29,14 +29,14 @@ {% if link.icons %} {{link.icons}}  {{link.num_outputs}} {% else %} - + 📄   {{link.num_outputs}} {% endif %} - + {{link.url}} diff --git a/archivebox/templates/core/navigation.html b/archivebox/templates/core/navigation.html index 90542f9fc9..479d631934 100644 --- a/archivebox/templates/core/navigation.html +++ b/archivebox/templates/core/navigation.html @@ -1,11 +1,12 @@ {% load i18n static %}
    - Add ➕     + Add ➕     Snapshots | Tags | Log     - Docs | + Docs | + API | Public | Admin     @@ -16,7 +17,7 @@ {% endblock %} {% block userlinks %} {% if user.has_usable_password %} - Account / + Account / {% endif %} {% trans 'Log out' %} {% endblock %} diff --git a/archivebox/templates/core/public_index.html b/archivebox/templates/core/public_index.html index 23ad5b21fe..4553591356 100644 --- a/archivebox/templates/core/public_index.html +++ b/archivebox/templates/core/public_index.html @@ -1,17 +1,32 @@ {% extends "base.html" %} {% load static tz %} +{% load core_tags %} {% block body %}
    -

    ") + assert "data-url-regex=" in body + assert 'id="url-highlight-layer"' in body + assert 'id="detected-urls-list"' in body + assert "detected-url-toggle-btn" in body + + +def test_add_view_checks_configured_search_backend_by_default(client, monkeypatch): + monkeypatch.setattr(SERVER_CONFIG, "PUBLIC_ADD_VIEW", True) + monkeypatch.setattr(SEARCH_BACKEND_CONFIG, "SEARCH_BACKEND_ENGINE", "sqlite") + + response = client.get(reverse("add"), HTTP_HOST=WEB_HOST) + body = response.content.decode() + + assert response.status_code == 200 + assert re.search( + r']* checked\b', + body, + ) + assert "const requiredSearchPlugin = 'search_backend_sqlite';" in body + + +def test_add_view_creates_crawl_with_tag_and_url_filter_overrides(client, admin_user, monkeypatch): + monkeypatch.setattr(SERVER_CONFIG, "PUBLIC_ADD_VIEW", True) + client.force_login(admin_user) + + response = client.post( + reverse("add"), + data={ + "url": "https://example.com\nhttps://cdn.example.com/asset.js", + "tag": "alpha,beta", + "depth": "1", + "max_urls": "3", + "max_size": "45mb", + "url_filters_allowlist": "example.com\n*.example.com", + "url_filters_denylist": "cdn.example.com", + "notes": "Created from /add/", + "schedule": "", + "persona": "Default", + "index_only": "", + "config": "{}", + }, + HTTP_HOST=WEB_HOST, + ) + + assert response.status_code == 302 + + crawl = Crawl.objects.order_by("-created_at").first() + assert crawl is not None + assert crawl.tags_str == "alpha,beta" + assert crawl.notes == "Created from /add/" + assert crawl.max_urls == 3 + assert crawl.max_size == 45 * 1024 * 1024 + assert crawl.config.get("DEFAULT_PERSONA") == "Default" + assert crawl.config["MAX_URLS"] == 3 + assert crawl.config["MAX_SIZE"] == 45 * 1024 * 1024 + assert crawl.config["URL_ALLOWLIST"] == "example.com\n*.example.com" + assert crawl.config["URL_DENYLIST"] == "cdn.example.com" + assert "OVERWRITE" not in crawl.config + assert "ONLY_NEW" not in crawl.config + + +def 
test_add_view_starts_background_runner_after_creating_crawl(client, admin_user, monkeypatch): + monkeypatch.setattr(SERVER_CONFIG, "PUBLIC_ADD_VIEW", True) + client.force_login(admin_user) + + runner_calls = [] + monkeypatch.setattr("archivebox.services.runner.ensure_background_runner", lambda: runner_calls.append(True) or True) + + response = client.post( + reverse("add"), + data={ + "url": "https://example.com", + "tag": "", + "depth": "0", + "max_urls": "0", + "max_size": "0", + "url_filters_allowlist": "", + "url_filters_denylist": "", + "notes": "", + "schedule": "", + "persona": "Default", + "index_only": "", + "config": "{}", + }, + HTTP_HOST=WEB_HOST, + ) + + assert response.status_code == 302 + assert runner_calls == [True] + + +def test_add_view_extracts_urls_from_mixed_text_input(client, admin_user, monkeypatch): + monkeypatch.setattr(SERVER_CONFIG, "PUBLIC_ADD_VIEW", True) + client.force_login(admin_user) + + response = client.post( + reverse("add"), + data={ + "url": "\n".join( + [ + "https://sweeting.me,https://google.com", + "Notes: [ArchiveBox](https://github.com/ArchiveBox/ArchiveBox), https://news.ycombinator.com", + "[Wiki](https://en.wikipedia.org/wiki/Classification_(machine_learning))", + '{"items":["https://example.com/three"]}', + "csv,https://example.com/four", + ], + ), + "tag": "", + "depth": "0", + "max_urls": "0", + "max_size": "0", + "url_filters_allowlist": "", + "url_filters_denylist": "", + "notes": "", + "schedule": "", + "persona": "Default", + "index_only": "", + "config": "{}", + }, + HTTP_HOST=WEB_HOST, + ) + + assert response.status_code == 302 + + crawl = Crawl.objects.order_by("-created_at").first() + assert crawl is not None + assert crawl.urls == "\n".join( + [ + "https://sweeting.me", + "https://google.com", + "https://github.com/ArchiveBox/ArchiveBox", + "https://news.ycombinator.com", + "https://en.wikipedia.org/wiki/Classification_(machine_learning)", + "https://example.com/three", + "https://example.com/four", + ], + 
) + + +def test_add_view_trims_trailing_punctuation_from_markdown_urls(client, admin_user, monkeypatch): + monkeypatch.setattr(SERVER_CONFIG, "PUBLIC_ADD_VIEW", True) + client.force_login(admin_user) + + response = client.post( + reverse("add"), + data={ + "url": "\n".join( + [ + "Docs: https://github.com/ArchiveBox/ArchiveBox.", + "Issue: https://github.com/abc?abc#234234?.", + ], + ), + "tag": "", + "depth": "0", + "max_urls": "0", + "max_size": "0", + "url_filters_allowlist": "", + "url_filters_denylist": "", + "notes": "", + "schedule": "", + "persona": "Default", + "index_only": "", + "config": "{}", + }, + HTTP_HOST=WEB_HOST, + ) + + assert response.status_code == 302 + + crawl = Crawl.objects.order_by("-created_at").first() + assert crawl is not None + assert crawl.urls == "\n".join( + [ + "https://github.com/ArchiveBox/ArchiveBox", + "https://github.com/abc?abc#234234", + ], + ) + + +def test_add_view_exposes_api_token_for_tag_widget_autocomplete(client, admin_user, monkeypatch): + monkeypatch.setattr(SERVER_CONFIG, "PUBLIC_ADD_VIEW", True) + client.force_login(admin_user) + + response = client.get(reverse("add"), HTTP_HOST=WEB_HOST) + + assert response.status_code == 200 + assert b"window.ARCHIVEBOX_API_KEY" in response.content + + +def test_tags_autocomplete_requires_auth_when_public_snapshots_list_disabled(client, settings): + settings.PUBLIC_SNAPSHOTS_LIST = False + settings.PUBLIC_INDEX = False + Tag.objects.create(name="archive") + + response = client.get( + reverse("api-1:tags_autocomplete"), + {"q": "a"}, + HTTP_HOST=ADMIN_HOST, + ) + + assert response.status_code == 401 + + +def test_tags_autocomplete_allows_public_access_when_public_snapshots_list_enabled(client, settings): + settings.PUBLIC_SNAPSHOTS_LIST = True + settings.PUBLIC_INDEX = False + Tag.objects.create(name="archive") + + response = client.get( + reverse("api-1:tags_autocomplete"), + {"q": "a"}, + HTTP_HOST=ADMIN_HOST, + ) + + assert response.status_code == 200 + assert 
response.json()["tags"][0]["name"] == "archive" + + +def test_tags_autocomplete_allows_authenticated_user_when_public_snapshots_list_disabled(client, admin_user, settings): + settings.PUBLIC_SNAPSHOTS_LIST = False + settings.PUBLIC_INDEX = False + Tag.objects.create(name="archive") + client.force_login(admin_user) + + response = client.get( + reverse("api-1:tags_autocomplete"), + {"q": "a"}, + HTTP_HOST=ADMIN_HOST, + ) + + assert response.status_code == 200 + assert response.json()["tags"][0]["name"] == "archive" diff --git a/archivebox/tests/test_admin_config_widget.py b/archivebox/tests/test_admin_config_widget.py new file mode 100644 index 0000000000..5fa6a176ad --- /dev/null +++ b/archivebox/tests/test_admin_config_widget.py @@ -0,0 +1,151 @@ +from archivebox.base_models.admin import KeyValueWidget + + +def test_key_value_widget_renders_enum_autocomplete_metadata(monkeypatch): + monkeypatch.setattr( + KeyValueWidget, + "_get_config_options", + lambda self: { + "CHROME_WAIT_FOR": { + "plugin": "chrome", + "type": "string", + "default": "networkidle2", + "description": "Page load completion condition", + "enum": ["domcontentloaded", "load", "networkidle0", "networkidle2"], + }, + }, + ) + + html = str( + KeyValueWidget().render( + "config", + {"CHROME_WAIT_FOR": "load"}, + attrs={"id": "id_config"}, + ), + ) + + assert '"enum": ["domcontentloaded", "load", "networkidle0", "networkidle2"]' in html + assert 'class="kv-value-options"' in html + assert 'class="kv-help"' in html + assert "configureValueInput_id_config" in html + assert "describeMeta_id_config" in html + assert "validateValueAgainstMeta_id_config" in html + + +def test_key_value_widget_renders_numeric_and_pattern_constraints(monkeypatch): + monkeypatch.setattr( + KeyValueWidget, + "_get_config_options", + lambda self: { + "TIMEOUT": { + "plugin": "base", + "type": "integer", + "default": 60, + "description": "Timeout in seconds", + "minimum": 5, + "maximum": 120, + }, + "CHROME_RESOLUTION": { + 
"plugin": "chrome", + "type": "string", + "default": "1440,2000", + "description": "Viewport resolution", + "pattern": "^\\d+,\\d+$", + }, + }, + ) + + html = str(KeyValueWidget().render("config", {}, attrs={"id": "id_config"})) + + assert '"minimum": 5' in html + assert '"maximum": 120' in html + assert '"pattern": "^\\\\d+,\\\\d+$"' in html + assert "Expected: " in html + assert "Example: " in html + assert "setValueValidationState_id_config" in html + assert "coerceValueForStorage_id_config" in html + + +def test_key_value_widget_accepts_common_boolean_spellings(monkeypatch): + monkeypatch.setattr( + KeyValueWidget, + "_get_config_options", + lambda self: { + "DEBUG": { + "plugin": "base", + "type": "boolean", + "default": False, + "description": "Enable debug mode", + }, + }, + ) + + html = str(KeyValueWidget().render("config", {"DEBUG": "True"}, attrs={"id": "id_config"})) + + assert "enumValues = ['True', 'False']" in html + assert "raw.toLowerCase()" in html + assert "lowered === 'true' || raw === '1'" in html + assert "lowered === 'false' || raw === '0'" in html + + +def test_key_value_widget_shows_array_and_object_examples_and_binary_rules(monkeypatch): + monkeypatch.setattr( + KeyValueWidget, + "_get_config_options", + lambda self: { + "WGET_ARGS_EXTRA": { + "plugin": "wget", + "type": "array", + "default": [], + "description": "Extra arguments to append to wget command", + }, + "SAVE_ALLOWLIST": { + "plugin": "base", + "type": "object", + "default": {}, + "description": "Regex allowlist mapped to enabled methods", + }, + "WGET_BINARY": { + "plugin": "wget", + "type": "string", + "default": "wget", + "description": "Path to wget binary", + }, + }, + ) + + html = str(KeyValueWidget().render("config", {}, attrs={"id": "id_config"})) + + assert 'Example: ["--extra-arg"]' in html + assert 'Example: {"^https://example\\\\.com": ["wget"]}' in html + assert "Example: wget or /usr/bin/wget" in html + assert "validateBinaryValue_id_config" in html + assert 
"meta.key.endsWith('_BINARY')" in html + assert "Binary paths cannot contain quotes" in html + + +def test_key_value_widget_falls_back_to_binary_validation_for_unknown_binary_keys(monkeypatch): + monkeypatch.setattr( + KeyValueWidget, + "_get_config_options", + lambda self: { + "CHROME_BINARY": { + "plugin": "base", + "type": "string", + "default": "", + "description": "Resolved Chromium/Chrome binary path shared across plugins", + }, + }, + ) + + html = str( + KeyValueWidget().render( + "config", + {"NODE_BINARY": "/opt/homebrew/bin/node"}, + attrs={"id": "id_config"}, + ), + ) + + assert "function getMetaForKey_id_config" in html + assert "if (key.endsWith('_BINARY'))" in html + assert "Path to binary executable" in html diff --git a/archivebox/tests/test_admin_links.py b/archivebox/tests/test_admin_links.py new file mode 100644 index 0000000000..22b09db04c --- /dev/null +++ b/archivebox/tests/test_admin_links.py @@ -0,0 +1,309 @@ +import pytest +from django.contrib.admin.sites import AdminSite +from django.test import RequestFactory +from django.urls import reverse +import html +from uuid import uuid4 + + +pytestmark = pytest.mark.django_db + + +def _create_snapshot(): + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + ) + return Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.STARTED, + ) + + +def _create_machine(): + from archivebox.machine.models import Machine + + return Machine.objects.create( + guid=f"test-guid-{uuid4()}", + hostname="test-host", + hw_in_docker=False, + hw_in_vm=False, + hw_manufacturer="Test", + hw_product="Test Product", + hw_uuid=f"test-hw-{uuid4()}", + os_arch="arm64", + os_family="darwin", + os_platform="macOS", + os_release="14.0", + os_kernel="Darwin", + 
stats={}, + config={}, + ) + + +def _create_iface(machine): + from archivebox.machine.models import NetworkInterface + + return NetworkInterface.objects.create( + machine=machine, + mac_address="00:11:22:33:44:66", + ip_public="203.0.113.11", + ip_local="10.0.0.11", + dns_server="1.1.1.1", + hostname="test-host", + iface="en0", + isp="Test ISP", + city="Test City", + region="Test Region", + country="Test Country", + ) + + +def test_archiveresult_admin_links_plugin_and_process(): + from archivebox.core.admin_archiveresults import ArchiveResultAdmin + from archivebox.core.models import ArchiveResult + from archivebox.machine.models import Process + + snapshot = _create_snapshot() + iface = _create_iface(_create_machine()) + process = Process.objects.create( + machine=iface.machine, + iface=iface, + process_type=Process.TypeChoices.HOOK, + pwd=str(snapshot.output_dir / "wget"), + cmd=["/tmp/on_Snapshot__06_wget.finite.bg.py", "--url=https://example.com"], + status=Process.StatusChoices.EXITED, + ) + result = ArchiveResult.objects.create( + snapshot=snapshot, + plugin="wget", + hook_name="on_Snapshot__06_wget.finite.bg.py", + process=process, + status=ArchiveResult.StatusChoices.SUCCEEDED, + ) + + admin = ArchiveResultAdmin(ArchiveResult, AdminSite()) + + plugin_html = str(admin.plugin_with_icon(result)) + process_html = str(admin.process_link(result)) + + assert "/admin/environment/plugins/builtin.wget/" in plugin_html + assert f"/admin/machine/process/{process.id}/change" in process_html + + +def test_snapshot_admin_zip_links(): + from archivebox.core.admin_snapshots import SnapshotAdmin + from archivebox.core.models import Snapshot + + snapshot = _create_snapshot() + admin = SnapshotAdmin(Snapshot, AdminSite()) + + zip_url = admin.get_snapshot_zip_url(snapshot) + + assert html.escape(zip_url, quote=True) not in str(admin.files(snapshot)) + assert html.escape(zip_url, quote=True) in str(admin.size_with_stats(snapshot)) + assert html.escape(zip_url, quote=True) in 
str(admin.admin_actions(snapshot)) + + +def test_archiveresult_admin_zip_links(): + from archivebox.core.admin_archiveresults import ArchiveResultAdmin + from archivebox.core.models import ArchiveResult + + snapshot = _create_snapshot() + result = ArchiveResult.objects.create( + snapshot=snapshot, + plugin="wget", + hook_name="on_Snapshot__06_wget.finite.bg.py", + status=ArchiveResult.StatusChoices.SUCCEEDED, + output_str="Saved output", + ) + + admin = ArchiveResultAdmin(ArchiveResult, AdminSite()) + zip_url = admin.get_output_zip_url(result) + + assert html.escape(zip_url, quote=True) in str(admin.zip_link(result)) + assert html.escape(zip_url, quote=True) in str(admin.admin_actions(result)) + + +def test_archiveresult_admin_copy_command_redacts_sensitive_env_keys(): + from archivebox.core.admin_archiveresults import ArchiveResultAdmin + from archivebox.core.models import ArchiveResult + from archivebox.machine.models import Process + + snapshot = _create_snapshot() + iface = _create_iface(_create_machine()) + process = Process.objects.create( + machine=iface.machine, + iface=iface, + process_type=Process.TypeChoices.HOOK, + pwd=str(snapshot.output_dir / "wget"), + cmd=["/tmp/on_Snapshot__06_wget.finite.bg.py", "--url=https://example.com"], + env={ + "SAFE_FLAG": "1", + "API_KEY": "super-secret-key", + "ACCESS_TOKEN": "super-secret-token", + "SHARED_SECRET": "super-secret-secret", + }, + status=Process.StatusChoices.EXITED, + url="https://example.com", + ) + result = ArchiveResult.objects.create( + snapshot=snapshot, + plugin="wget", + hook_name="on_Snapshot__06_wget.finite.bg.py", + process=process, + status=ArchiveResult.StatusChoices.SUCCEEDED, + ) + + admin = ArchiveResultAdmin(ArchiveResult, AdminSite()) + cmd_html = str(admin.cmd_str(result)) + + assert "SAFE_FLAG=1" in cmd_html + assert "https://example.com" in cmd_html + assert "API_KEY" not in cmd_html + assert "ACCESS_TOKEN" not in cmd_html + assert "SHARED_SECRET" not in cmd_html + assert 
"super-secret-key" not in cmd_html + assert "super-secret-token" not in cmd_html + assert "super-secret-secret" not in cmd_html + + +def test_process_admin_links_binary_and_iface(): + from archivebox.machine.admin import ProcessAdmin + from archivebox.machine.models import Binary, Process + + machine = _create_machine() + iface = _create_iface(machine) + binary = Binary.objects.create( + machine=machine, + name="wget", + abspath="/usr/local/bin/wget", + version="1.21.2", + binprovider="env", + binproviders="env", + status=Binary.StatusChoices.INSTALLED, + ) + process = Process.objects.create( + machine=machine, + iface=iface, + binary=binary, + process_type=Process.TypeChoices.HOOK, + pwd="/tmp/wget", + cmd=["/tmp/on_Snapshot__06_wget.finite.bg.py", "--url=https://example.com"], + status=Process.StatusChoices.EXITED, + ) + + admin = ProcessAdmin(Process, AdminSite()) + + binary_html = str(admin.binary_link(process)) + iface_html = str(admin.iface_link(process)) + + assert f"/admin/machine/binary/{binary.id}/change" in binary_html + assert f"/admin/machine/networkinterface/{iface.id}/change" in iface_html + + +def test_process_admin_kill_actions_only_terminate_running_processes(monkeypatch): + from archivebox.machine.admin import ProcessAdmin + from archivebox.machine.models import Process + + machine = _create_machine() + running = Process.objects.create( + machine=machine, + process_type=Process.TypeChoices.HOOK, + pwd="/tmp/running", + cmd=["/tmp/on_Snapshot__06_wget.finite.bg.py", "--url=https://example.com"], + status=Process.StatusChoices.RUNNING, + ) + exited = Process.objects.create( + machine=machine, + process_type=Process.TypeChoices.HOOK, + pwd="/tmp/exited", + cmd=["/tmp/on_Snapshot__06_wget.finite.bg.py", "--url=https://example.com"], + status=Process.StatusChoices.EXITED, + ) + + admin = ProcessAdmin(Process, AdminSite()) + request = RequestFactory().post("/admin/machine/process/") + + terminated = [] + flashed = [] + + monkeypatch.setattr(Process, 
"is_running", property(lambda self: self.pk == running.pk), raising=False) + monkeypatch.setattr(Process, "terminate", lambda self, graceful_timeout=5.0: terminated.append(self.pk) or True) + monkeypatch.setattr(admin, "message_user", lambda req, msg, level=None: flashed.append((msg, level))) + + admin.kill_processes(request, Process.objects.filter(pk__in=[running.pk, exited.pk]).order_by("created_at")) + + assert terminated == [running.pk] + assert any("Killed 1 running process" in msg for msg, _level in flashed) + assert any("Skipped 1 process" in msg for msg, _level in flashed) + + +def test_process_admin_object_kill_action_redirects_and_skips_exited(monkeypatch): + from archivebox.machine.admin import ProcessAdmin + from archivebox.machine.models import Process + + machine = _create_machine() + process = Process.objects.create( + machine=machine, + process_type=Process.TypeChoices.HOOK, + pwd="/tmp/exited", + cmd=["/tmp/on_Snapshot__06_wget.finite.bg.py", "--url=https://example.com"], + status=Process.StatusChoices.EXITED, + ) + + admin = ProcessAdmin(Process, AdminSite()) + request = RequestFactory().post(f"/admin/machine/process/{process.pk}/change/") + + terminated = [] + flashed = [] + + monkeypatch.setattr(Process, "is_running", property(lambda self: False), raising=False) + monkeypatch.setattr(Process, "terminate", lambda self, graceful_timeout=5.0: terminated.append(self.pk) or True) + monkeypatch.setattr(admin, "message_user", lambda req, msg, level=None: flashed.append((msg, level))) + + response = admin.kill_process(request, process) + + assert response.status_code == 302 + assert response.url == reverse("admin:machine_process_change", args=[process.pk]) + assert terminated == [] + assert any("Skipped 1 process" in msg for msg, _level in flashed) + + +def test_process_admin_output_summary_uses_archiveresult_output_files(): + from archivebox.core.models import ArchiveResult + from archivebox.machine.admin import ProcessAdmin + from 
archivebox.machine.models import Process + + snapshot = _create_snapshot() + machine = _create_machine() + process = Process.objects.create( + machine=machine, + process_type=Process.TypeChoices.HOOK, + pwd=str(snapshot.output_dir / "wget"), + cmd=["/tmp/on_Snapshot__06_wget.finite.bg.py", "--url=https://example.com"], + status=Process.StatusChoices.EXITED, + ) + ArchiveResult.objects.create( + snapshot=snapshot, + plugin="wget", + hook_name="on_Snapshot__06_wget.finite.bg.py", + process=process, + status=ArchiveResult.StatusChoices.SUCCEEDED, + output_files={ + "index.html": {"extension": "html", "mimetype": "text/html", "size": 1024}, + "title.txt": {"extension": "txt", "mimetype": "text/plain", "size": "512"}, + }, + ) + + admin = ProcessAdmin(Process, AdminSite()) + + output_html = str(admin.output_summary(process)) + + assert "2 files" in output_html + assert "1.5 KB" in output_html diff --git a/archivebox/tests/test_admin_views.py b/archivebox/tests/test_admin_views.py new file mode 100644 index 0000000000..0bfc0158dd --- /dev/null +++ b/archivebox/tests/test_admin_views.py @@ -0,0 +1,1702 @@ +""" +Tests for admin snapshot views and search functionality. 
+ +Tests cover: +- Admin snapshot list view +- Admin grid view +- Search functionality (both admin and public) +- Snapshot progress statistics +""" + +import json +import pytest +import uuid +from pathlib import Path +from types import SimpleNamespace +from typing import cast +from unittest.mock import patch +from django.test import override_settings +from django.test.client import RequestFactory +from django.urls import reverse +from django.contrib.auth import get_user_model +from django.contrib.auth.models import UserManager +from django.utils import timezone + +from archivebox.config.common import SEARCH_BACKEND_CONFIG + +pytestmark = pytest.mark.django_db + + +User = get_user_model() +ADMIN_HOST = "admin.archivebox.localhost:8000" +PUBLIC_HOST = "public.archivebox.localhost:8000" + + +@pytest.fixture +def admin_user(db): + """Create admin user for tests.""" + return cast(UserManager, User.objects).create_superuser( + username="testadmin", + email="admin@test.com", + password="testpassword", + ) + + +@pytest.fixture +def crawl(admin_user, db): + """Create test crawl.""" + from archivebox.crawls.models import Crawl + + return Crawl.objects.create( + urls="https://example.com", + created_by=admin_user, + ) + + +@pytest.fixture +def snapshot(crawl, db): + """Create test snapshot.""" + from archivebox.core.models import Snapshot + + return Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.STARTED, + ) + + +class TestSnapshotProgressStats: + """Tests for Snapshot.get_progress_stats() method.""" + + def test_get_progress_stats_empty(self, snapshot): + """Test progress stats with no archive results.""" + stats = snapshot.get_progress_stats() + + assert stats["total"] == 0 + assert stats["succeeded"] == 0 + assert stats["failed"] == 0 + assert stats["running"] == 0 + assert stats["pending"] == 0 + assert stats["percent"] == 0 + assert stats["output_size"] == 0 + assert stats["is_sealed"] is False + + def 
test_get_progress_stats_with_results(self, snapshot, db): + """Test progress stats with various archive result statuses.""" + from archivebox.core.models import ArchiveResult + + # Create some archive results + ArchiveResult.objects.create( + snapshot=snapshot, + plugin="wget", + status="succeeded", + output_size=1000, + ) + ArchiveResult.objects.create( + snapshot=snapshot, + plugin="screenshot", + status="succeeded", + output_size=2000, + ) + ArchiveResult.objects.create( + snapshot=snapshot, + plugin="pdf", + status="failed", + ) + ArchiveResult.objects.create( + snapshot=snapshot, + plugin="readability", + status="started", + ) + + stats = snapshot.get_progress_stats() + + assert stats["total"] == 4 + assert stats["succeeded"] == 2 + assert stats["failed"] == 1 + assert stats["running"] == 1 + assert stats["output_size"] == 3000 + assert stats["percent"] == 75 # (2 succeeded + 1 failed) / 4 total + + def test_snapshot_admin_progress_uses_expected_hook_total_not_observed_result_count(self, snapshot, monkeypatch): + from archivebox.core.admin_site import archivebox_admin + from archivebox.core.admin_snapshots import SnapshotAdmin + from archivebox.core.models import ArchiveResult, Snapshot + + ArchiveResult.objects.create( + snapshot=snapshot, + plugin="wget", + hook_name="on_Snapshot__50_wget.py", + status="succeeded", + output_size=1000, + ) + ArchiveResult.objects.create( + snapshot=snapshot, + plugin="title", + hook_name="on_Snapshot__54_title.py", + status="started", + ) + + prefetched_snapshot = Snapshot.objects.prefetch_related("archiveresult_set").get(pk=snapshot.pk) + admin = SnapshotAdmin(Snapshot, archivebox_admin) + monkeypatch.setattr(admin, "_get_expected_hook_total", lambda obj: 5) + + stats = admin._get_progress_stats(prefetched_snapshot) + html = str(admin.status_with_progress(prefetched_snapshot)) + + assert stats["total"] == 5 + assert stats["succeeded"] == 1 + assert stats["running"] == 1 + assert stats["pending"] == 3 + assert 
stats["percent"] == 20 + assert "1/5 hooks" in html + + def test_get_progress_stats_sealed(self, snapshot): + """Test progress stats for sealed snapshot.""" + from archivebox.core.models import Snapshot + + snapshot.status = Snapshot.StatusChoices.SEALED + snapshot.save() + + stats = snapshot.get_progress_stats() + assert stats["is_sealed"] is True + + def test_archive_size_uses_prefetched_results_without_output_dir(self, snapshot, monkeypatch): + """archive_size should use prefetched ArchiveResult sizes before touching disk.""" + from archivebox.core.models import ArchiveResult, Snapshot + + ArchiveResult.objects.create( + snapshot=snapshot, + plugin="wget", + status="succeeded", + output_size=4096, + ) + + prefetched_snapshot = Snapshot.objects.prefetch_related("archiveresult_set").get(pk=snapshot.pk) + + def _output_dir_should_not_be_used(self): + raise AssertionError("archive_size should not access Snapshot.output_dir when results are prefetched") + + monkeypatch.setattr(Snapshot, "output_dir", property(_output_dir_should_not_be_used), raising=False) + + assert prefetched_snapshot.archive_size == 4096 + + def test_snapshot_serialization_exposes_output_size_alias(self, snapshot): + """Snapshot serializers should expose output_size as an alias of archive_size.""" + from archivebox.core.models import ArchiveResult + + ArchiveResult.objects.create( + snapshot=snapshot, + plugin="wget", + status="succeeded", + output_size=4096, + ) + + assert snapshot.to_dict()["archive_size"] == 4096 + assert snapshot.to_dict()["output_size"] == 4096 + assert snapshot.to_dict()["status"] == snapshot.status + assert snapshot.to_json()["archive_size"] == 4096 + assert snapshot.to_json()["output_size"] == 4096 + assert snapshot.to_csv(cols=["output_size"]) == "4096" + assert snapshot.to_csv(cols=["status"]) == '"started"' + + def test_is_archived_true_for_sealed_snapshot_without_legacy_output_paths(self, snapshot, monkeypatch): + """Sealed snapshots should count as archived without 
relying on legacy output filenames.""" + from archivebox.core.models import Snapshot + + snapshot.status = Snapshot.StatusChoices.SEALED + snapshot.save(update_fields=["status", "modified_at"]) + + def _missing_output_dir(self): + return Path("/definitely/missing") + + monkeypatch.setattr(Snapshot, "output_dir", property(_missing_output_dir), raising=False) + + assert snapshot.is_archived is True + + def test_discover_outputs_uses_output_file_metadata_size(self, snapshot): + """discover_outputs should use output_files metadata before filesystem fallbacks.""" + from archivebox.core.models import ArchiveResult + + output_dir = Path(snapshot.output_dir) / "ytdlp" + output_dir.mkdir(parents=True, exist_ok=True) + (output_dir / "video.mp4").write_bytes(b"video") + + ArchiveResult.objects.create( + snapshot=snapshot, + plugin="ytdlp", + status="succeeded", + output_str="", + output_files={"video.mp4": {"size": 9876, "mimetype": "video/mp4", "extension": "mp4"}}, + output_size=0, + ) + + outputs = snapshot.discover_outputs(include_filesystem_fallback=False) + ytdlp_output = next(output for output in outputs if output["name"] == "ytdlp") + + assert ytdlp_output["path"] == "ytdlp/video.mp4" + assert ytdlp_output["size"] == 9876 + + def test_media_helpers_use_output_file_metadata_without_disk(self): + """Template helpers should derive media lists and sizes from output_files metadata.""" + from archivebox.core.templatetags.core_tags import _count_media_files, _list_media_files + + result = SimpleNamespace( + output_files={ + "video.mp4": {"size": 111, "mimetype": "video/mp4", "extension": "mp4"}, + "audio.mp3": {"size": 222, "mimetype": "audio/mpeg", "extension": "mp3"}, + }, + snapshot_dir="/tmp/does-not-need-to-exist", + plugin="ytdlp", + ) + + assert _count_media_files(result) == 2 + assert _list_media_files(result) == [ + {"name": "audio.mp3", "path": "ytdlp/audio.mp3", "size": 222}, + {"name": "video.mp4", "path": "ytdlp/video.mp4", "size": 111}, + ] + + def 
test_discover_outputs_falls_back_to_hashes_index_without_filesystem_walk(self, snapshot, monkeypatch): + """Older snapshots can still render cards from hashes.json when DB output_files are missing.""" + from archivebox.core.models import ArchiveResult, Snapshot + + ArchiveResult.objects.create( + snapshot=snapshot, + plugin="responses", + status="succeeded", + output_str="141 responses", + output_files={}, + ) + + monkeypatch.setattr( + Snapshot, + "hashes_index", + property( + lambda self: { + "responses/index.jsonl": {"size": 456}, + "responses/all/20260323T073504__GET__example.com__.html": {"size": 789}, + "responses/all/20260323T073504__GET__example.com__app.js": {"size": 123}, + }, + ), + raising=False, + ) + + outputs = snapshot.discover_outputs(include_filesystem_fallback=False) + + assert next(output for output in outputs if output["name"] == "responses")["path"] == ( + "responses/all/20260323T073504__GET__example.com__.html" + ) + + def test_discover_outputs_falls_back_to_filesystem_for_missing_db_and_hashes(self, snapshot, monkeypatch): + """Snapshot page can still recover cards from plugin dirs when DB metadata is missing.""" + from archivebox.core.models import Snapshot + + monkeypatch.setattr(Snapshot, "hashes_index", property(lambda self: {}), raising=False) + + responses_dir = Path(snapshot.output_dir) / "responses" + (responses_dir / "all").mkdir(parents=True, exist_ok=True) + (responses_dir / "index.jsonl").write_text("{}", encoding="utf-8") + (responses_dir / "all" / "20260323T073504__GET__example.com__.html").write_text("ok", encoding="utf-8") + + outputs = snapshot.discover_outputs(include_filesystem_fallback=True) + + assert next(output for output in outputs if output["name"] == "responses")["path"] == ( + "responses/all/20260323T073504__GET__example.com__.html" + ) + + def test_embed_path_db_ignores_human_readable_output_messages(self, snapshot): + from archivebox.core.models import ArchiveResult + + result = ArchiveResult.objects.create( + 
snapshot=snapshot, + plugin="singlefile", + status="failed", + output_str="SingleFile extension did not produce output", + ) + + assert result.embed_path_db() is None + + def test_embed_path_db_prefers_valid_output_str_over_first_output_file(self, snapshot): + from archivebox.core.models import ArchiveResult + + output_dir = Path(snapshot.output_dir) / "wget" / "example.com" / "assets" / "css" + output_dir.mkdir(parents=True, exist_ok=True) + (Path(snapshot.output_dir) / "wget" / "example.com" / "index.html").parent.mkdir(parents=True, exist_ok=True) + (Path(snapshot.output_dir) / "wget" / "example.com" / "index.html").write_text("ok", encoding="utf-8") + (output_dir / "mobile.css").write_text("body {}", encoding="utf-8") + + result = ArchiveResult.objects.create( + snapshot=snapshot, + plugin="wget", + status="succeeded", + output_str="wget/example.com/index.html", + output_files={ + "example.com/assets/css/mobile.css": {"size": 123, "mimetype": "text/css"}, + "example.com/index.html": {"size": 456, "mimetype": "text/html"}, + }, + ) + + assert result.embed_path_db() == "wget/example.com/index.html" + + def test_embed_path_db_scores_output_files_instead_of_using_first_entry(self, snapshot): + from archivebox.core.models import ArchiveResult + + output_dir = Path(snapshot.output_dir) / "wget" / "example.com" / "assets" / "css" + output_dir.mkdir(parents=True, exist_ok=True) + (Path(snapshot.output_dir) / "wget" / "example.com" / "index.html").parent.mkdir(parents=True, exist_ok=True) + (Path(snapshot.output_dir) / "wget" / "example.com" / "index.html").write_text("ok", encoding="utf-8") + (output_dir / "mobile.css").write_text("body {}", encoding="utf-8") + + result = ArchiveResult.objects.create( + snapshot=snapshot, + plugin="wget", + status="succeeded", + output_str="", + output_files={ + "example.com/assets/css/mobile.css": {"size": 123, "mimetype": "text/css"}, + "example.com/index.html": {"size": 456, "mimetype": "text/html"}, + }, + ) + + assert 
result.embed_path_db() == "wget/example.com/index.html" + + def test_embed_path_db_rejects_mimetype_like_output_str(self, snapshot): + from archivebox.core.models import ArchiveResult + + result = ArchiveResult.objects.create( + snapshot=snapshot, + plugin="staticfile", + status="succeeded", + output_str="text/html", + ) + + assert result.embed_path_db() is None + + def test_embed_path_db_rejects_output_str_that_does_not_exist_on_disk(self, snapshot): + from archivebox.core.models import ArchiveResult + + result = ArchiveResult.objects.create( + snapshot=snapshot, + plugin="dns", + status="succeeded", + output_str="1.2.3.4", + ) + + assert result.embed_path_db() is None + + def test_embed_path_db_uses_output_file_fallbacks_without_disk_check(self, snapshot): + from archivebox.core.models import ArchiveResult + + result = ArchiveResult.objects.create( + snapshot=snapshot, + plugin="responses", + status="succeeded", + output_str="", + output_files={ + "all/20260323T073504__GET__example.com__.html": {"size": 789, "mimetype": "text/html"}, + }, + ) + + assert result.embed_path_db() == "responses/all/20260323T073504__GET__example.com__.html" + + def test_discover_outputs_keeps_jsonl_only_plugins_with_non_path_output_str(self, snapshot): + from archivebox.core.models import ArchiveResult + + ArchiveResult.objects.create( + snapshot=snapshot, + plugin="dns", + status="succeeded", + output_str="1.2.3.4", + output_files={"dns.jsonl": {"size": 1519, "mimetype": "application/jsonl"}}, + ) + ArchiveResult.objects.create( + snapshot=snapshot, + plugin="sslcerts", + status="succeeded", + output_str="WR2", + output_files={"sslcerts.jsonl": {"size": 3138, "mimetype": "application/jsonl"}}, + ) + + outputs = {output["name"]: output for output in snapshot.discover_outputs(include_filesystem_fallback=False)} + + assert outputs["dns"]["path"] == "dns/dns.jsonl" + assert outputs["sslcerts"]["path"] == "sslcerts/sslcerts.jsonl" + assert outputs["dns"]["is_metadata"] is True + assert 
outputs["sslcerts"]["is_metadata"] is True + + def test_embed_path_uses_explicit_fallback_not_first_output_file(self, snapshot): + from archivebox.core.models import ArchiveResult + + output_dir = Path(snapshot.output_dir) / "responses" / "all" + output_dir.mkdir(parents=True, exist_ok=True) + (output_dir / "20260323T073504__GET__example.com__.html").write_text("ok", encoding="utf-8") + + result = ArchiveResult.objects.create( + snapshot=snapshot, + plugin="responses", + status="succeeded", + output_str="141 responses", + output_files={ + "all/20260323T073504__GET__example.com__app.js": {"size": 123, "mimetype": "application/javascript"}, + "all/20260323T073504__GET__example.com__.html": {"size": 789, "mimetype": "text/html"}, + "index.jsonl": {"size": 456, "mimetype": "application/jsonl"}, + }, + ) + + assert result.embed_path_db() == "responses/all/20260323T073504__GET__example.com__.html" + assert result.embed_path() == "responses/all/20260323T073504__GET__example.com__.html" + + def test_detail_page_auxiliary_items_include_failed_plugins(self, snapshot): + from archivebox.core.models import ArchiveResult + + ArchiveResult.objects.create( + snapshot=snapshot, + plugin="singlefile", + status=ArchiveResult.StatusChoices.FAILED, + output_str="SingleFile extension did not produce output", + ) + + loose_items, failed_items = snapshot.get_detail_page_auxiliary_items(outputs=[]) + + assert loose_items == [] + assert failed_items == [ + { + "name": "singlefile (failed)", + "path": "singlefile", + "is_dir": True, + "size": 0, + }, + ] + + def test_detail_page_auxiliary_items_include_hidden_failed_plugins(self, snapshot): + from archivebox.core.models import ArchiveResult + + ArchiveResult.objects.create( + snapshot=snapshot, + plugin="favicon", + status=ArchiveResult.StatusChoices.FAILED, + output_str="No favicon found", + ) + + _, failed_items = snapshot.get_detail_page_auxiliary_items(outputs=[], hidden_card_plugins={"favicon"}) + + assert failed_items == [ + { + 
"name": "favicon (failed)", + "path": "favicon", + "is_dir": True, + "size": 0, + }, + ] + + def test_detail_page_auxiliary_items_exclude_noresults_and_skipped(self, snapshot): + from archivebox.core.models import ArchiveResult + + ArchiveResult.objects.create( + snapshot=snapshot, + plugin="title", + status=ArchiveResult.StatusChoices.NORESULTS, + output_str="No title found", + ) + ArchiveResult.objects.create( + snapshot=snapshot, + plugin="favicon", + status=ArchiveResult.StatusChoices.SKIPPED, + output_str="Skipped", + ) + + _, failed_items = snapshot.get_detail_page_auxiliary_items(outputs=[]) + + assert failed_items == [] + + def test_plugin_full_prefers_db_embed_path_over_empty_filesystem_embed_path(self, monkeypatch): + from archivebox.core.templatetags import core_tags + + result = SimpleNamespace( + plugin="title", + snapshot=SimpleNamespace(), + snapshot_id="019d191c-5e42-77fc-b5b6-ffa4215f6d64", + embed_path_db=lambda: "title/title.txt", + embed_path=lambda: None, + ) + + monkeypatch.setattr(core_tags, "get_plugin_template", lambda plugin, view: "{{ output_path }}") + + html = str(core_tags.plugin_full({"request": None}, result)) + + assert "title/title.txt" in html + assert "?preview=1" not in html + assert html != "http://snap-ffa4215f6d64.archivebox.localhost:8000" + + def test_plugin_full_returns_empty_for_none_result(self): + from archivebox.core.templatetags import core_tags + + assert core_tags.plugin_full({"request": None}, None) == "" + + def test_write_html_details_succeeds_with_index_only_fallback_output(self, snapshot): + output_dir = Path(snapshot.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + (output_dir / "index.jsonl").write_text('{"type":"Snapshot"}\n', encoding="utf-8") + + snapshot.write_html_details() + + rendered = (output_dir / "index.html").read_text(encoding="utf-8") + + assert "full-page-iframe" in rendered + assert "index.jsonl?preview=1" in rendered + + +class TestAdminSnapshotListView: + """Tests for the admin 
snapshot list view.""" + + def test_list_view_renders(self, client, admin_user): + """Test that the list view renders successfully.""" + client.login(username="testadmin", password="testpassword") + url = reverse("admin:core_snapshot_changelist") + response = client.get(url, HTTP_HOST=ADMIN_HOST) + + assert response.status_code == 200 + + def test_list_view_with_snapshots(self, client, admin_user, snapshot): + """Test list view with snapshots displays them.""" + client.login(username="testadmin", password="testpassword") + url = reverse("admin:core_snapshot_changelist") + response = client.get(url, HTTP_HOST=ADMIN_HOST) + + assert response.status_code == 200 + assert b"example.com" in response.content + + def test_list_view_avoids_legacy_title_fallbacks(self, client, admin_user, snapshot, monkeypatch): + """Title-less snapshots should render without touching history-based fallback paths.""" + from archivebox.core.models import Snapshot + + Snapshot.objects.filter(pk=snapshot.pk).update(title="") + + def _latest_title_should_not_be_used(self): + raise AssertionError("admin changelist should not access Snapshot.latest_title") + + def _history_should_not_be_used(self): + raise AssertionError("admin changelist should not access Snapshot.history") + + monkeypatch.setattr(Snapshot, "latest_title", property(_latest_title_should_not_be_used), raising=False) + monkeypatch.setattr(Snapshot, "history", property(_history_should_not_be_used), raising=False) + + client.login(username="testadmin", password="testpassword") + url = reverse("admin:core_snapshot_changelist") + response = client.get(url, HTTP_HOST=ADMIN_HOST) + + assert response.status_code == 200 + assert b"example.com" in response.content + + def test_live_progress_excludes_old_archiveresults_from_previous_snapshot_run(self, client, admin_user, crawl, snapshot): + from datetime import timedelta + from archivebox.core.models import ArchiveResult + from archivebox.crawls.models import Crawl + from 
archivebox.core.models import Snapshot + + client.login(username="testadmin", password="testpassword") + + now = timezone.now() + Crawl.objects.filter(pk=crawl.pk).update( + status=Crawl.StatusChoices.STARTED, + retry_at=now, + modified_at=now, + ) + Snapshot.objects.filter(pk=snapshot.pk).update( + status=Snapshot.StatusChoices.STARTED, + retry_at=None, + downloaded_at=now - timedelta(minutes=1), + modified_at=now, + ) + + ArchiveResult.objects.create( + snapshot=snapshot, + plugin="wget", + hook_name="on_Snapshot__06_wget.finite.bg", + status=ArchiveResult.StatusChoices.SUCCEEDED, + start_ts=now - timedelta(hours=1, minutes=1), + end_ts=now - timedelta(hours=1), + ) + ArchiveResult.objects.create( + snapshot=snapshot, + plugin="chrome", + hook_name="on_Snapshot__11_chrome_wait", + status=ArchiveResult.StatusChoices.QUEUED, + ) + + response = client.get("/admin/live-progress/", HTTP_HOST=ADMIN_HOST) + + assert response.status_code == 200 + payload = response.json() + active_crawl = next(item for item in payload["active_crawls"] if item["id"] == str(crawl.pk)) + active_snapshot = next(item for item in active_crawl["active_snapshots"] if item["id"] == str(snapshot.pk)) + plugin_names = [item["plugin"] for item in active_snapshot["all_plugins"]] + assert plugin_names == ["chrome"] + + def test_live_progress_does_not_hide_active_snapshot_results_when_modified_at_moves(self, client, admin_user, crawl, snapshot): + from datetime import timedelta + from archivebox.core.models import ArchiveResult + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + + client.login(username="testadmin", password="testpassword") + + now = timezone.now() + Crawl.objects.filter(pk=crawl.pk).update( + status=Crawl.StatusChoices.STARTED, + retry_at=now, + modified_at=now, + ) + Snapshot.objects.filter(pk=snapshot.pk).update( + status=Snapshot.StatusChoices.STARTED, + retry_at=None, + created_at=now - timedelta(hours=2), + modified_at=now, + 
downloaded_at=None, + ) + + ArchiveResult.objects.create( + snapshot=snapshot, + plugin="wget", + hook_name="on_Snapshot__06_wget.finite.bg", + status=ArchiveResult.StatusChoices.STARTED, + start_ts=now - timedelta(minutes=5), + ) + + response = client.get("/admin/live-progress/", HTTP_HOST=ADMIN_HOST) + + assert response.status_code == 200 + payload = response.json() + active_crawl = next(item for item in payload["active_crawls"] if item["id"] == str(crawl.pk)) + active_snapshot = next(item for item in active_crawl["active_snapshots"] if item["id"] == str(snapshot.pk)) + plugin_names = [item["plugin"] for item in active_snapshot["all_plugins"]] + assert plugin_names == ["wget"] + + def test_list_view_avoids_output_dir_lookups(self, client, admin_user, snapshot, monkeypatch): + """Changelist links should render without probing snapshot paths on disk.""" + from archivebox.core.models import Snapshot + + def _output_dir_should_not_be_used(self): + raise AssertionError("admin changelist should not access Snapshot.output_dir") + + monkeypatch.setattr(Snapshot, "output_dir", property(_output_dir_should_not_be_used), raising=False) + + client.login(username="testadmin", password="testpassword") + url = reverse("admin:core_snapshot_changelist") + response = client.get(url, HTTP_HOST=ADMIN_HOST) + + assert response.status_code == 200 + assert b"example.com" in response.content + + def test_list_view_avoids_snapshot_icons_helper(self, client, admin_user, snapshot, monkeypatch): + """Changelist should not call Snapshot.icons for each row anymore.""" + from archivebox.core.models import ArchiveResult, Snapshot + + ArchiveResult.objects.create( + snapshot=snapshot, + plugin="wget", + status=ArchiveResult.StatusChoices.SUCCEEDED, + output_files={"index.html": {"size": 123, "extension": "html"}}, + ) + + def _icons_should_not_be_used(self, path=None): + raise AssertionError("admin changelist should not call Snapshot.icons") + + monkeypatch.setattr(Snapshot, "icons", 
_icons_should_not_be_used, raising=True) + + client.login(username="testadmin", password="testpassword") + url = reverse("admin:core_snapshot_changelist") + response = client.get(url, HTTP_HOST=ADMIN_HOST) + + assert response.status_code == 200 + assert b"wget" in response.content + + def test_list_view_uses_prefetched_tags_without_row_queries(self, client, admin_user, crawl, db): + """Changelist tag rendering should reuse the prefetched tag cache.""" + from django.db import connection + from django.test.utils import CaptureQueriesContext + from archivebox.core.models import Snapshot, Tag + + tags = [Tag.objects.create(name=f"perf-tag-{idx}") for idx in range(3)] + for idx in range(5): + snap = Snapshot.objects.create( + url=f"https://example.com/{idx}", + crawl=crawl, + status=Snapshot.StatusChoices.STARTED, + title=f"Title {idx}", + ) + snap.tags.add(*tags[: (idx % 3) + 1]) + + client.login(username="testadmin", password="testpassword") + url = reverse("admin:core_snapshot_changelist") + with CaptureQueriesContext(connection) as ctx: + response = client.get(url, HTTP_HOST=ADMIN_HOST) + + assert response.status_code == 200 + per_row_tag_queries = [ + query["sql"] + for query in ctx.captured_queries + if 'FROM "core_tag"' in query["sql"] and '"core_snapshot_tags"."snapshot_id"' in query["sql"] and " IN " not in query["sql"] + ] + assert per_row_tag_queries == [] + + def test_grid_view_renders(self, client, admin_user): + """Test that the grid view renders successfully.""" + client.login(username="testadmin", password="testpassword") + url = reverse("admin:grid") + response = client.get(url, HTTP_HOST=ADMIN_HOST) + + assert response.status_code == 200 + + def test_view_mode_switcher_present(self, client, admin_user): + """Test that view mode switcher is present.""" + client.login(username="testadmin", password="testpassword") + url = reverse("admin:core_snapshot_changelist") + response = client.get(url, HTTP_HOST=ADMIN_HOST) + + assert response.status_code == 200 + 
# Check for visible snapshot actions-bar controls + assert b"snapshot-view-toggle" in response.content + assert b"Grid" in response.content + assert reverse("admin:grid").encode() in response.content + + def test_binary_change_view_renders(self, client, admin_user, db): + """Binary admin change form should load without FieldError.""" + from archivebox.machine.models import Machine, Binary + + machine = Machine.objects.create( + guid=f"test-guid-{uuid.uuid4()}", + hostname="test-host", + hw_in_docker=False, + hw_in_vm=False, + hw_manufacturer="Test", + hw_product="Test Product", + hw_uuid=f"test-hw-{uuid.uuid4()}", + os_arch="x86_64", + os_family="darwin", + os_platform="darwin", + os_release="test", + os_kernel="test-kernel", + stats={}, + ) + binary = Binary.objects.create( + machine=machine, + name="gallery-dl", + binproviders="env", + binprovider="env", + abspath="/opt/homebrew/bin/gallery-dl", + version="1.26.9", + sha256="abc123", + status=Binary.StatusChoices.INSTALLED, + ) + + client.login(username="testadmin", password="testpassword") + url = f"/admin/machine/binary/{binary.pk}/change/" + response = client.get(url, HTTP_HOST=ADMIN_HOST) + + assert response.status_code == 200 + assert b"gallery-dl" in response.content + + def test_process_change_view_renders_copyable_cmd_env_and_readonly_runtime_fields(self, client, admin_user, db): + from datetime import timedelta + from archivebox.machine.models import Machine, Process + + machine = Machine.objects.create( + guid=f"test-guid-{uuid.uuid4()}", + hostname="test-host", + hw_in_docker=False, + hw_in_vm=False, + hw_manufacturer="Test", + hw_product="Test Product", + hw_uuid=f"test-hw-{uuid.uuid4()}", + os_arch="x86_64", + os_family="darwin", + os_platform="darwin", + os_release="test", + os_kernel="test-kernel", + stats={}, + ) + process = Process.objects.create( + machine=machine, + process_type=Process.TypeChoices.HOOK, + status=Process.StatusChoices.EXITED, + pwd="/tmp/archivebox", + cmd=["python", 
"/tmp/job.py", "--url=https://example.com"], + env={ + "ENABLED": True, + "API_KEY": "super-secret-key", + "ACCESS_TOKEN": "super-secret-token", + "SHARED_SECRET": "super-secret-secret", + }, + timeout=90, + pid=54321, + exit_code=0, + url="https://example.com/status", + started_at=timezone.now() - timedelta(seconds=52), + ended_at=timezone.now(), + ) + + client.login(username="testadmin", password="testpassword") + url = reverse("admin:machine_process_change", args=[process.pk]) + response = client.get(url, HTTP_HOST=ADMIN_HOST) + + assert response.status_code == 200 + assert b"Kill" in response.content + assert b"python /tmp/job.py --url=https://example.com" in response.content + assert b"ENABLED=True" in response.content + assert b"52s" in response.content + assert b"API_KEY=" not in response.content + assert b"ACCESS_TOKEN=" not in response.content + assert b"SHARED_SECRET=" not in response.content + assert b"super-secret-key" not in response.content + assert b"super-secret-token" not in response.content + assert b"super-secret-secret" not in response.content + assert response.content.count(b"data-command=") >= 2 + assert b'name="timeout"' not in response.content + assert b'name="pid"' not in response.content + assert b'name="exit_code"' not in response.content + assert b'name="url"' not in response.content + assert b'name="started_at"' not in response.content + assert b'name="ended_at"' not in response.content + + def test_process_list_view_shows_duration_snapshot_and_crawl_columns(self, client, admin_user, snapshot, db): + from datetime import timedelta + from archivebox.core.models import ArchiveResult + from archivebox.machine.models import Machine, Process + + machine = Machine.objects.create( + guid=f"list-guid-{uuid.uuid4()}", + hostname="list-host", + hw_in_docker=False, + hw_in_vm=False, + hw_manufacturer="Test", + hw_product="Test Product", + hw_uuid=f"list-hw-{uuid.uuid4()}", + os_arch="x86_64", + os_family="darwin", + os_platform="darwin", + 
os_release="test", + os_kernel="test-kernel", + stats={}, + ) + process = Process.objects.create( + machine=machine, + process_type=Process.TypeChoices.HOOK, + status=Process.StatusChoices.EXITED, + pwd="/tmp/archivebox", + cmd=["python", "/tmp/job.py"], + env={}, + pid=12345, + exit_code=0, + started_at=timezone.now() - timedelta(milliseconds=10), + ended_at=timezone.now(), + ) + ArchiveResult.objects.create( + snapshot=snapshot, + process=process, + plugin="title", + hook_name="on_Snapshot__54_title", + status="succeeded", + output_str="Example Domain", + ) + + client.login(username="testadmin", password="testpassword") + response = client.get(reverse("admin:machine_process_changelist"), HTTP_HOST=ADMIN_HOST) + + assert response.status_code == 200 + assert b"Duration" in response.content + assert b"Snapshot" in response.content + assert b"Crawl" in response.content + assert b"0.01s" in response.content + changelist = response.context["cl"] + row = next(obj for obj in changelist.result_list if obj.pk == process.pk) + + assert row.archiveresult.snapshot_id == snapshot.id + assert str(snapshot.id) in str(changelist.model_admin.snapshot_link(row)) + assert str(snapshot.crawl_id) in str(changelist.model_admin.crawl_link(row)) + + def test_change_view_renders_real_redo_failed_action(self, client, admin_user, snapshot): + client.login(username="testadmin", password="testpassword") + url = reverse("admin:core_snapshot_change", args=[snapshot.pk]) + response = client.get(url, HTTP_HOST=ADMIN_HOST) + + assert response.status_code == 200 + assert f"/admin/core/snapshot/{snapshot.pk}/redo-failed/".encode() in response.content + + def test_snapshot_view_url_uses_canonical_replay_url_for_mode(self, snapshot, monkeypatch): + from archivebox.config.common import SERVER_CONFIG + from archivebox.core.admin_site import archivebox_admin + from archivebox.core.admin_snapshots import SnapshotAdmin + + admin = SnapshotAdmin(snapshot.__class__, archivebox_admin) + + 
monkeypatch.setattr(SERVER_CONFIG, "SERVER_SECURITY_MODE", "safe-subdomains-fullreplay") + assert admin.get_snapshot_view_url(snapshot) == f"http://snap-{str(snapshot.pk).replace('-', '')[-12:]}.archivebox.localhost:8000" + + monkeypatch.setattr(SERVER_CONFIG, "SERVER_SECURITY_MODE", "safe-onedomain-nojsreplay") + assert admin.get_snapshot_view_url(snapshot) == f"http://archivebox.localhost:8000/snapshot/{snapshot.pk}" + + def test_find_snapshots_for_url_matches_fragment_suffixed_variants(self, crawl, db): + from archivebox.core.models import Snapshot + from archivebox.core.views import SnapshotView + + canonical = Snapshot.objects.create( + url="https://example.com/page", + crawl=crawl, + status=Snapshot.StatusChoices.STARTED, + ) + old_variant = Snapshot.objects.create( + url="https://example.com/page#2026-03-23T12:34:56", + crawl=crawl, + status=Snapshot.StatusChoices.STARTED, + ) + + matches = list(SnapshotView.find_snapshots_for_url(canonical.url).order_by("url")) + + assert [snap.url for snap in matches] == [canonical.url, old_variant.url] + + def test_change_view_renders_readonly_tag_pills_near_title(self, client, admin_user, snapshot): + from archivebox.core.models import Tag + + tag = Tag.objects.create(name="Alpha Research") + snapshot.tags.add(tag) + + client.login(username="testadmin", password="testpassword") + url = reverse("admin:core_snapshot_change", args=[snapshot.pk]) + response = client.get(url, HTTP_HOST=ADMIN_HOST) + + assert response.status_code == 200 + assert b"Alpha Research" in response.content + assert b"tag-editor-inline readonly" in response.content + assert b'data-readonly="1"' in response.content + + def test_redo_failed_action_requeues_snapshot(self, client, admin_user, snapshot): + from archivebox.core.models import ArchiveResult + + ArchiveResult.objects.create( + snapshot=snapshot, + plugin="title", + hook_name="on_Snapshot__54_title", + status=ArchiveResult.StatusChoices.FAILED, + output_str="boom", + ) + + 
client.login(username="testadmin", password="testpassword") + url = reverse("admin:core_snapshot_redo_failed", args=[snapshot.pk]) + response = client.post(url, HTTP_HOST=ADMIN_HOST) + + assert response.status_code == 302 + assert response["Location"].endswith(f"/admin/core/snapshot/{snapshot.pk}/change/") + assert snapshot.archiveresult_set.get(plugin="title").status == ArchiveResult.StatusChoices.QUEUED + + def test_archive_now_action_uses_original_snapshot_url_without_timestamp_suffix(self, client, admin_user, snapshot, monkeypatch): + import archivebox.core.admin_snapshots as admin_snapshots + + snapshot.url = "https://example.com/path#section-1" + snapshot.save(update_fields=["url"]) + + queued = [] + + def fake_bg_add(payload): + queued.append(payload) + + monkeypatch.setattr(admin_snapshots, "bg_add", fake_bg_add) + + client.login(username="testadmin", password="testpassword") + url = reverse("admin:core_snapshot_changelist") + response = client.post( + url, + { + "action": "resnapshot_snapshot", + "_selected_action": [str(snapshot.pk)], + "index": "0", + }, + HTTP_HOST=ADMIN_HOST, + ) + + assert response.status_code == 302 + assert queued == [{"urls": "https://example.com/path#section-1"}] + + def test_archive_now_action_groups_multiple_snapshots_into_one_crawl(self, client, admin_user, snapshot, monkeypatch): + import archivebox.core.admin_snapshots as admin_snapshots + from archivebox.core.models import Snapshot + + other_snapshot = Snapshot.objects.create( + url="https://example.com/other#frag", + crawl=snapshot.crawl, + status=Snapshot.StatusChoices.STARTED, + ) + + queued = [] + + def fake_bg_add(payload): + queued.append(payload) + + monkeypatch.setattr(admin_snapshots, "bg_add", fake_bg_add) + + client.login(username="testadmin", password="testpassword") + url = reverse("admin:core_snapshot_changelist") + response = client.post( + url, + { + "action": "resnapshot_snapshot", + "_selected_action": [str(snapshot.pk), str(other_snapshot.pk)], + "index": 
"0", + }, + HTTP_HOST=ADMIN_HOST, + ) + + assert response.status_code == 302 + assert len(queued) == 1 + assert set(queued[0]["urls"].splitlines()) == {"https://example.com", "https://example.com/other#frag"} + + def test_change_view_archiveresults_inline_shows_process_and_machine_links(self, client, admin_user, snapshot, db): + import archivebox.machine.models as machine_models + from archivebox.core.models import ArchiveResult + from archivebox.machine.models import Machine, Process + + machine_models._CURRENT_MACHINE = None + machine = Machine.current() + process = Process.objects.create( + machine=machine, + process_type=Process.TypeChoices.HOOK, + status=Process.StatusChoices.EXITED, + pid=54321, + exit_code=0, + cmd=["/plugins/title/on_Snapshot__54_title.js", "--url=https://example.com"], + env={"EXTRA_CONTEXT": json.dumps({"snapshot_id": str(snapshot.id)})}, + started_at=timezone.now(), + ended_at=timezone.now(), + ) + ArchiveResult.objects.create( + snapshot=snapshot, + process=process, + plugin="title", + hook_name="on_Snapshot__54_title.js", + status=ArchiveResult.StatusChoices.SUCCEEDED, + output_str="Example Domain", + ) + + client.login(username="testadmin", password="testpassword") + url = reverse("admin:core_snapshot_change", args=[snapshot.pk]) + response = client.get(url, HTTP_HOST=ADMIN_HOST) + + assert response.status_code == 200 + assert b"Process" in response.content + assert b"Machine" in response.content + assert b"54321" in response.content + assert machine.hostname.encode() in response.content + assert reverse("admin:machine_process_change", args=[process.id]).encode() in response.content + assert reverse("admin:machine_machine_change", args=[machine.id]).encode() in response.content + + +class TestCrawlScheduleAdmin: + def test_crawlschedule_add_view_renders_and_saves(self, client, admin_user, crawl): + from archivebox.crawls.models import CrawlSchedule + + client.login(username="testadmin", password="testpassword") + + add_url = 
reverse("admin:crawls_crawlschedule_add") + get_response = client.get(add_url, HTTP_HOST=ADMIN_HOST) + + assert get_response.status_code == 200 + assert b"Schedule Info" in get_response.content + assert b"No Crawls yet..." not in get_response.content + assert b"No Snapshots yet..." not in get_response.content + + post_response = client.post( + add_url, + { + "label": "Nightly crawl", + "notes": "", + "schedule": "0 0 * * *", + "template": str(crawl.pk), + "created_by": str(admin_user.pk), + "_save": "Save", + }, + HTTP_HOST=ADMIN_HOST, + ) + + assert post_response.status_code == 302 + schedule = CrawlSchedule.objects.get(label="Nightly crawl") + assert schedule.template_id == crawl.pk + assert schedule.created_by_id == admin_user.pk + + def test_crawlschedule_changelist_renders_snapshot_counts(self, client, admin_user, crawl, snapshot): + from archivebox.crawls.models import CrawlSchedule + + schedule = CrawlSchedule.objects.create( + label="Daily crawl", + notes="", + schedule="0 0 * * *", + template=crawl, + created_by=admin_user, + ) + crawl.schedule = schedule + crawl.save(update_fields=["schedule"]) + snapshot.crawl = crawl + snapshot.save(update_fields=["crawl"]) + + client.login(username="testadmin", password="testpassword") + url = reverse("admin:crawls_crawlschedule_changelist") + response = client.get(url, HTTP_HOST=ADMIN_HOST) + + assert response.status_code == 200 + assert b"Daily crawl" in response.content + + +class TestArchiveResultAdminListView: + def test_list_view_renders_readonly_tags_and_noresults_status(self, client, admin_user, snapshot): + from archivebox.core.models import ArchiveResult, Tag + + tag = Tag.objects.create(name="Alpha Research") + snapshot.tags.add(tag) + ArchiveResult.objects.create( + snapshot=snapshot, + plugin="title", + status=ArchiveResult.StatusChoices.NORESULTS, + output_str="No title found", + ) + + client.login(username="testadmin", password="testpassword") + response = 
client.get(reverse("admin:core_archiveresult_changelist"), HTTP_HOST=ADMIN_HOST) + + assert response.status_code == 200 + assert b"Alpha Research" in response.content + assert b"tag-editor-inline readonly" in response.content + assert b"No Results" in response.content + + def test_api_token_admin_list_view_renders(self, client, admin_user): + client.login(username="testadmin", password="testpassword") + response = client.get(reverse("admin:api_apitoken_changelist"), HTTP_HOST=ADMIN_HOST) + + assert response.status_code == 200 + assert b"API Keys" in response.content + + def test_user_admin_list_view_renders(self, client, admin_user): + client.login(username="testadmin", password="testpassword") + response = client.get(reverse("admin:auth_user_changelist"), HTTP_HOST=ADMIN_HOST) + + assert response.status_code == 200 + assert b"Select user to change" in response.content + + def test_archiveresult_model_has_no_retry_at_field(self): + from archivebox.core.models import ArchiveResult + + assert "retry_at" not in {field.name for field in ArchiveResult._meta.fields} + + +class TestLiveProgressView: + def test_live_progress_ignores_unscoped_running_processes_when_no_crawls(self, client, admin_user, db): + import os + import archivebox.machine.models as machine_models + from archivebox.machine.models import Machine, Process + + machine_models._CURRENT_MACHINE = None + machine = Machine.current() + Process.objects.create( + machine=machine, + process_type=Process.TypeChoices.HOOK, + status=Process.StatusChoices.RUNNING, + pid=os.getpid(), + cmd=["/plugins/title/on_Snapshot__10_title.py", "--url=https://example.com"], + env={}, + started_at=timezone.now(), + ) + + client.login(username="testadmin", password="testpassword") + response = client.get(reverse("live_progress"), HTTP_HOST=ADMIN_HOST) + + assert response.status_code == 200 + payload = response.json() + assert payload["active_crawls"] == [] + assert payload["total_workers"] == 0 + + def 
test_live_progress_cleans_stale_running_processes(self, client, admin_user, db): + from datetime import timedelta + import archivebox.machine.models as machine_models + from archivebox.machine.models import Machine, Process + + machine_models._CURRENT_MACHINE = None + machine = Machine.current() + proc = Process.objects.create( + machine=machine, + process_type=Process.TypeChoices.HOOK, + status=Process.StatusChoices.RUNNING, + pid=999999, + cmd=["/plugins/title/on_Snapshot__10_title.py", "--url=https://example.com"], + env={}, + started_at=timezone.now() - timedelta(days=2), + ) + + client.login(username="testadmin", password="testpassword") + response = client.get(reverse("live_progress"), HTTP_HOST=ADMIN_HOST) + + assert response.status_code == 200 + proc.refresh_from_db() + assert proc.status == Process.StatusChoices.EXITED + assert proc.ended_at is not None + assert response.json()["total_workers"] == 0 + + def test_live_progress_routes_crawl_process_rows_to_crawl_setup(self, client, admin_user, snapshot, db): + import os + import archivebox.machine.models as machine_models + from archivebox.machine.models import Machine, Process + + machine_models._CURRENT_MACHINE = None + machine = Machine.current() + pid = os.getpid() + Process.objects.create( + machine=machine, + process_type=Process.TypeChoices.HOOK, + status=Process.StatusChoices.RUNNING, + pid=pid, + pwd=str(snapshot.output_dir / "chrome"), + cmd=["/plugins/chrome/on_CrawlSetup__91_chrome_wait.js", "--url=https://example.com"], + started_at=timezone.now(), + ) + + client.login(username="testadmin", password="testpassword") + with ( + patch.object(Process, "cleanup_stale_running", return_value=0), + patch.object(Process, "cleanup_orphaned_workers", return_value=0), + ): + response = client.get(reverse("live_progress"), HTTP_HOST=ADMIN_HOST) + + assert response.status_code == 200 + payload = response.json() + active_crawl = next(crawl for crawl in payload["active_crawls"] if crawl["id"] == 
str(snapshot.crawl_id)) + setup_entry = next(item for item in active_crawl["setup_plugins"] if item["source"] == "process") + active_snapshot = next(item for item in active_crawl["active_snapshots"] if item["id"] == str(snapshot.id)) + assert setup_entry["label"] == "chrome wait" + assert setup_entry["status"] == "started" + assert active_crawl["worker_pid"] == pid + assert active_snapshot["all_plugins"] == [] + + def test_live_progress_uses_snapshot_process_rows_before_archiveresults(self, client, admin_user, snapshot, db): + import os + import archivebox.machine.models as machine_models + from archivebox.machine.models import Machine, Process + + machine_models._CURRENT_MACHINE = None + machine = Machine.current() + pid = os.getpid() + Process.objects.create( + machine=machine, + process_type=Process.TypeChoices.HOOK, + status=Process.StatusChoices.RUNNING, + pid=pid, + pwd=str(snapshot.output_dir / "title"), + cmd=["/plugins/title/on_Snapshot__10_title.py", "--url=https://example.com"], + started_at=timezone.now(), + ) + + client.login(username="testadmin", password="testpassword") + with ( + patch.object(Process, "cleanup_stale_running", return_value=0), + patch.object(Process, "cleanup_orphaned_workers", return_value=0), + ): + response = client.get(reverse("live_progress"), HTTP_HOST=ADMIN_HOST) + + assert response.status_code == 200 + payload = response.json() + active_crawl = next(crawl for crawl in payload["active_crawls"] if crawl["id"] == str(snapshot.crawl_id)) + active_snapshot = next(item for item in active_crawl["active_snapshots"] if item["id"] == str(snapshot.id)) + assert active_snapshot["all_plugins"][0]["source"] == "process" + assert active_snapshot["all_plugins"][0]["label"] == "title" + assert active_snapshot["all_plugins"][0]["status"] == "started" + assert active_snapshot["worker_pid"] == pid + + def test_live_progress_merges_process_rows_with_archiveresults_when_present(self, client, admin_user, snapshot, db): + import os + import 
archivebox.machine.models as machine_models + from archivebox.core.models import ArchiveResult + from archivebox.machine.models import Machine, Process + + machine_models._CURRENT_MACHINE = None + machine = Machine.current() + Process.objects.create( + machine=machine, + process_type=Process.TypeChoices.HOOK, + status=Process.StatusChoices.RUNNING, + pid=os.getpid(), + pwd=str(snapshot.output_dir / "chrome"), + cmd=["/plugins/chrome/on_Snapshot__11_chrome_wait.js", "--url=https://example.com"], + started_at=timezone.now(), + ) + ArchiveResult.objects.create( + snapshot=snapshot, + plugin="title", + status=ArchiveResult.StatusChoices.STARTED, + ) + + client.login(username="testadmin", password="testpassword") + with ( + patch.object(Process, "cleanup_stale_running", return_value=0), + patch.object(Process, "cleanup_orphaned_workers", return_value=0), + ): + response = client.get(reverse("live_progress"), HTTP_HOST=ADMIN_HOST) + + assert response.status_code == 200 + payload = response.json() + active_crawl = next(crawl for crawl in payload["active_crawls"] if crawl["id"] == str(snapshot.crawl_id)) + active_snapshot = next(item for item in active_crawl["active_snapshots"] if item["id"] == str(snapshot.id)) + sources = {item["source"] for item in active_snapshot["all_plugins"]} + plugins = {item["plugin"] for item in active_snapshot["all_plugins"]} + assert sources == {"archiveresult", "process"} + assert "title" in plugins + assert "chrome" in plugins + + def test_live_progress_omits_pid_for_exited_process_rows(self, client, admin_user, snapshot, db): + import archivebox.machine.models as machine_models + from archivebox.machine.models import Machine, Process + + machine_models._CURRENT_MACHINE = None + machine = Machine.current() + Process.objects.create( + machine=machine, + process_type=Process.TypeChoices.HOOK, + status=Process.StatusChoices.EXITED, + exit_code=0, + pid=99999, + pwd=str(snapshot.output_dir / "title"), + 
cmd=["/plugins/title/on_Snapshot__10_title.py", "--url=https://example.com"], + started_at=timezone.now(), + ended_at=timezone.now(), + ) + + client.login(username="testadmin", password="testpassword") + response = client.get(reverse("live_progress"), HTTP_HOST=ADMIN_HOST) + + assert response.status_code == 200 + payload = response.json() + active_crawl = next(crawl for crawl in payload["active_crawls"] if crawl["id"] == str(snapshot.crawl_id)) + active_snapshot = next(item for item in active_crawl["active_snapshots"] if item["id"] == str(snapshot.id)) + process_entry = next(item for item in active_snapshot["all_plugins"] if item["source"] == "process") + assert process_entry["status"] == "succeeded" + assert "pid" not in process_entry + + +class TestAdminSnapshotSearch: + """Tests for admin snapshot search functionality.""" + + def test_admin_search_mode_selector_defaults_to_meta_for_ripgrep(self, client, admin_user, monkeypatch): + monkeypatch.setattr(SEARCH_BACKEND_CONFIG, "SEARCH_BACKEND_ENGINE", "ripgrep") + + client.login(username="testadmin", password="testpassword") + response = client.get(reverse("admin:core_snapshot_changelist"), HTTP_HOST=ADMIN_HOST) + + assert response.status_code == 200 + assert b'name="search_mode" value="meta" checked' in response.content + assert b'name="search_mode" value="contents"' in response.content + assert b'name="search_mode" value="deep"' in response.content + + def test_admin_search_mode_selector_defaults_to_contents_for_non_ripgrep(self, client, admin_user, monkeypatch): + monkeypatch.setattr(SEARCH_BACKEND_CONFIG, "SEARCH_BACKEND_ENGINE", "sqlite") + + client.login(username="testadmin", password="testpassword") + response = client.get(reverse("admin:core_snapshot_changelist"), HTTP_HOST=ADMIN_HOST) + + assert response.status_code == 200 + assert b'name="search_mode" value="contents" checked' in response.content + + def test_admin_search_mode_selector_stays_checked_after_search(self, client, admin_user, crawl, 
monkeypatch): + from archivebox.core.models import Snapshot + + Snapshot.objects.create( + url="https://example.com/fulltext-only", + title="Unrelated Title", + crawl=crawl, + ) + + monkeypatch.setattr( + "archivebox.search.admin.query_search_index", + lambda query, search_mode=None: Snapshot.objects.all(), + ) + + client.login(username="testadmin", password="testpassword") + response = client.get( + reverse("admin:core_snapshot_changelist"), + {"q": "google", "search_mode": "contents"}, + HTTP_HOST=ADMIN_HOST, + ) + + assert response.status_code == 200 + assert b'name="search_mode" value="contents" checked' in response.content + assert b'name="search_mode" value="meta" checked' not in response.content + assert b'id="changelist"' in response.content + assert b"search-mode-contents" in response.content + + def test_admin_search_mode_adds_result_tint_class_for_deep(self, client, admin_user, crawl, monkeypatch): + from archivebox.core.models import Snapshot + + Snapshot.objects.create( + url="https://example.com/fulltext-only", + title="Unrelated Title", + crawl=crawl, + ) + + monkeypatch.setattr( + "archivebox.search.admin.query_search_index", + lambda query, search_mode=None: Snapshot.objects.all(), + ) + + client.login(username="testadmin", password="testpassword") + response = client.get( + reverse("admin:core_snapshot_changelist"), + {"q": "google", "search_mode": "deep"}, + HTTP_HOST=ADMIN_HOST, + ) + + assert response.status_code == 200 + assert b'name="search_mode" value="deep" checked' in response.content + assert b"search-mode-deep" in response.content + assert b"search-rank-" in response.content + + def test_deep_search_assigns_metadata_contents_and_deep_only_ranks(self, client, admin_user, crawl, monkeypatch): + from archivebox.core.models import Snapshot + + metadata_snapshot = Snapshot.objects.create( + url="https://example.com/google-meta", + title="Google Metadata Match", + crawl=crawl, + ) + contents_snapshot = Snapshot.objects.create( + 
url="https://example.com/contents-only", + title="Unrelated Title", + crawl=crawl, + ) + deep_snapshot = Snapshot.objects.create( + url="https://example.com/deep-only", + title="Unrelated Title", + crawl=crawl, + ) + + def fake_query_search_index(query, search_mode=None): + if search_mode == "contents": + return Snapshot.objects.filter(pk=contents_snapshot.pk) + if search_mode == "deep": + return Snapshot.objects.filter(pk__in=[contents_snapshot.pk, deep_snapshot.pk]) + return Snapshot.objects.none() + + monkeypatch.setattr("archivebox.search.admin.query_search_index", fake_query_search_index) + + client.login(username="testadmin", password="testpassword") + response = client.get( + reverse("admin:core_snapshot_changelist"), + {"q": "google", "search_mode": "deep"}, + HTTP_HOST=ADMIN_HOST, + ) + + assert response.status_code == 200 + ranks = dict(response.context["cl"].queryset.values_list("pk", "search_rank")) + assert ranks[metadata_snapshot.pk] == 0 + assert ranks[contents_snapshot.pk] == 1 + assert ranks[deep_snapshot.pk] == 2 + assert b'class="search-rank-0"' in response.content + assert b'class="search-rank-1"' in response.content + assert b'class="search-rank-2"' in response.content + + def test_search_ranks_metadata_matches_before_fulltext_by_default(self, client, admin_user, crawl, monkeypatch): + from archivebox.core.models import Snapshot + + metadata_snapshot = Snapshot.objects.create( + url="https://example.com/google-meta", + title="Google Metadata Match", + crawl=crawl, + ) + fulltext_snapshot = Snapshot.objects.create( + url="https://example.com/fulltext-only", + title="Unrelated Title", + crawl=crawl, + ) + + monkeypatch.setattr( + "archivebox.search.admin.query_search_index", + lambda query, search_mode=None: Snapshot.objects.filter(pk=fulltext_snapshot.pk), + ) + + client.login(username="testadmin", password="testpassword") + response = client.get(reverse("admin:core_snapshot_changelist"), {"q": "google", "search_mode": "contents"}, 
HTTP_HOST=ADMIN_HOST) + + assert response.status_code == 200 + result_ids = list(response.context["cl"].queryset.values_list("pk", flat=True)) + assert result_ids[:2] == [metadata_snapshot.pk, fulltext_snapshot.pk] + + def test_manual_admin_sort_ignores_search_rank_ordering(self, client, admin_user, crawl, monkeypatch): + from archivebox.core.models import Snapshot + + metadata_snapshot = Snapshot.objects.create( + url="https://example.com/google-meta", + title="Google Metadata Match", + crawl=crawl, + ) + fulltext_snapshot = Snapshot.objects.create( + url="https://example.com/fulltext-only", + title="Unrelated Title", + crawl=crawl, + ) + + monkeypatch.setattr( + "archivebox.search.admin.query_search_index", + lambda query, search_mode=None: Snapshot.objects.filter(pk=fulltext_snapshot.pk), + ) + + client.login(username="testadmin", password="testpassword") + response = client.get( + reverse("admin:core_snapshot_changelist"), + {"q": "google", "search_mode": "contents", "o": "-0"}, + HTTP_HOST=ADMIN_HOST, + ) + + assert response.status_code == 200 + result_ids = list(response.context["cl"].queryset.values_list("pk", flat=True)) + assert result_ids[:2] == [fulltext_snapshot.pk, metadata_snapshot.pk] + + def test_search_by_url(self, client, admin_user, snapshot): + """Test searching snapshots by URL.""" + client.login(username="testadmin", password="testpassword") + url = reverse("admin:core_snapshot_changelist") + response = client.get(url, {"q": "example.com"}, HTTP_HOST=ADMIN_HOST) + + assert response.status_code == 200 + # The search should find the example.com snapshot + assert b"example.com" in response.content + + def test_search_by_title(self, client, admin_user, crawl, db): + """Test searching snapshots by title.""" + from archivebox.core.models import Snapshot + + Snapshot.objects.create( + url="https://example.com/titled", + title="Unique Title For Testing", + crawl=crawl, + ) + + client.login(username="testadmin", password="testpassword") + url = 
reverse("admin:core_snapshot_changelist") + response = client.get(url, {"q": "Unique Title"}, HTTP_HOST=ADMIN_HOST) + + assert response.status_code == 200 + + def test_search_by_tag(self, client, admin_user, snapshot, db): + """Test searching snapshots by tag.""" + from archivebox.core.models import Tag + + tag = Tag.objects.create(name="test-search-tag") + snapshot.tags.add(tag) + + client.login(username="testadmin", password="testpassword") + url = reverse("admin:core_snapshot_changelist") + response = client.get(url, {"q": "test-search-tag"}, HTTP_HOST=ADMIN_HOST) + + assert response.status_code == 200 + + def test_empty_search(self, client, admin_user): + """Test empty search returns all snapshots.""" + client.login(username="testadmin", password="testpassword") + url = reverse("admin:core_snapshot_changelist") + response = client.get(url, {"q": ""}, HTTP_HOST=ADMIN_HOST) + + assert response.status_code == 200 + + def test_no_results_search(self, client, admin_user): + """Test search with no results.""" + client.login(username="testadmin", password="testpassword") + url = reverse("admin:core_snapshot_changelist") + response = client.get(url, {"q": "nonexistent-url-xyz789"}, HTTP_HOST=ADMIN_HOST) + + assert response.status_code == 200 + + +class TestPublicIndexSearch: + """Tests for public index search functionality.""" + + @pytest.fixture + def public_snapshot(self, crawl, db): + """Create sealed snapshot for public index.""" + from archivebox.core.models import Snapshot + + return Snapshot.objects.create( + url="https://public-example.com", + title="Public Example Website", + crawl=crawl, + status=Snapshot.StatusChoices.SEALED, + ) + + @override_settings(PUBLIC_INDEX=True) + def test_public_search_by_url(self, client, public_snapshot): + """Test public search by URL.""" + response = client.get("/public/", {"q": "public-example.com"}, HTTP_HOST=PUBLIC_HOST) + assert response.status_code == 200 + + @override_settings(PUBLIC_INDEX=True) + def 
test_public_search_mode_selector_defaults_to_meta_for_ripgrep(self, client, monkeypatch): + monkeypatch.setattr(SEARCH_BACKEND_CONFIG, "SEARCH_BACKEND_ENGINE", "ripgrep") + + response = client.get("/public/", HTTP_HOST=PUBLIC_HOST) + + assert response.status_code == 200 + assert b'name="search_mode" value="meta" checked' in response.content + + def test_public_search_ranks_metadata_matches_before_fulltext(self, crawl, monkeypatch): + from archivebox.core.models import Snapshot + from archivebox.core.views import PublicIndexView + + metadata_snapshot = Snapshot.objects.create( + url="https://public-example.com/google-meta", + title="Google Metadata Match", + crawl=crawl, + status=Snapshot.StatusChoices.SEALED, + ) + fulltext_snapshot = Snapshot.objects.create( + url="https://public-example.com/fulltext-only", + title="Unrelated Title", + crawl=crawl, + status=Snapshot.StatusChoices.SEALED, + ) + + monkeypatch.setattr( + "archivebox.core.views.query_search_index", + lambda query, search_mode=None: Snapshot.objects.filter(pk=fulltext_snapshot.pk), + ) + + request = RequestFactory().get("/public/", {"q": "google", "search_mode": "contents"}) + view = PublicIndexView() + view.request = request + + result_ids = list(view.get_queryset().values_list("pk", flat=True)) + assert result_ids[:2] == [metadata_snapshot.pk, fulltext_snapshot.pk] + + @override_settings(PUBLIC_INDEX=True) + def test_public_index_redirects_logged_in_users_to_admin_snapshot_list(self, client, admin_user): + client.force_login(admin_user) + + response = client.get("/public/", HTTP_HOST=PUBLIC_HOST) + + assert response.status_code == 302 + assert response["Location"] == "/admin/core/snapshot/" + + @override_settings(PUBLIC_INDEX=True) + def test_public_search_by_title(self, client, public_snapshot): + """Test public search by title.""" + response = client.get("/public/", {"q": "Public Example"}, HTTP_HOST=PUBLIC_HOST) + assert response.status_code == 200 + + @override_settings(PUBLIC_INDEX=True) + def 
test_public_search_query_type_meta(self, client, public_snapshot): + """Test public search with query_type=meta.""" + response = client.get("/public/", {"q": "example", "query_type": "meta"}, HTTP_HOST=PUBLIC_HOST) + assert response.status_code == 200 + + @override_settings(PUBLIC_INDEX=True) + def test_public_search_query_type_url(self, client, public_snapshot): + """Test public search with query_type=url.""" + response = client.get("/public/", {"q": "public-example.com", "query_type": "url"}, HTTP_HOST=PUBLIC_HOST) + assert response.status_code == 200 + + @override_settings(PUBLIC_INDEX=True) + def test_public_search_query_type_title(self, client, public_snapshot): + """Test public search with query_type=title.""" + response = client.get("/public/", {"q": "Website", "query_type": "title"}, HTTP_HOST=PUBLIC_HOST) + assert response.status_code == 200 diff --git a/archivebox/tests/test_api_cli_schedule.py b/archivebox/tests/test_api_cli_schedule.py new file mode 100644 index 0000000000..aeab36367f --- /dev/null +++ b/archivebox/tests/test_api_cli_schedule.py @@ -0,0 +1,36 @@ +from io import StringIO + +from django.contrib.auth import get_user_model +from django.test import RequestFactory, TestCase + +from archivebox.api.v1_cli import ScheduleCommandSchema, cli_schedule +from archivebox.crawls.models import CrawlSchedule + +User = get_user_model() + + +class CLIScheduleAPITests(TestCase): + def setUp(self): + self.user = User.objects.create_user( + username="api-user", + password="testpass123", + email="api@example.com", + ) + + def test_schedule_api_creates_schedule(self): + request = RequestFactory().post("/api/v1/cli/schedule") + request.user = self.user + setattr(request, "stdout", StringIO()) + setattr(request, "stderr", StringIO()) + args = ScheduleCommandSchema( + every="daily", + import_path="https://example.com/feed.xml", + quiet=True, + ) + + response = cli_schedule(request, args) + + self.assertTrue(response["success"]) + 
self.assertEqual(response["result_format"], "json") + self.assertEqual(CrawlSchedule.objects.count(), 1) + self.assertEqual(len(response["result"]["created_schedule_ids"]), 1) diff --git a/archivebox/tests/test_archive_result_service.py b/archivebox/tests/test_archive_result_service.py new file mode 100644 index 0000000000..14ad8168f5 --- /dev/null +++ b/archivebox/tests/test_archive_result_service.py @@ -0,0 +1,510 @@ +from pathlib import Path +from uuid import uuid4 + +import pytest +from django.db import connection + + +from abx_dl.events import ArchiveResultEvent, BinaryRequestEvent, ProcessEvent, ProcessStartedEvent +from abx_dl.orchestrator import create_bus +from abx_dl.output_files import OutputFile + + +pytestmark = pytest.mark.django_db(transaction=True) + + +def _cleanup_machine_process_rows() -> None: + with connection.cursor() as cursor: + cursor.execute("DELETE FROM machine_process") + + +def _create_snapshot(): + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + ) + return Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.STARTED, + ) + + +def _create_machine(): + from archivebox.machine.models import Machine + + return Machine.objects.create( + guid=f"test-guid-{uuid4()}", + hostname="test-host", + hw_in_docker=False, + hw_in_vm=False, + hw_manufacturer="Test", + hw_product="Test Product", + hw_uuid=f"test-hw-{uuid4()}", + os_arch="arm64", + os_family="darwin", + os_platform="macOS", + os_release="14.0", + os_kernel="Darwin", + stats={}, + config={}, + ) + + +def _create_iface(machine): + from archivebox.machine.models import NetworkInterface + + return NetworkInterface.objects.create( + machine=machine, + mac_address="00:11:22:33:44:55", + ip_public="203.0.113.10", + 
ip_local="10.0.0.10", + dns_server="1.1.1.1", + hostname="test-host", + iface="en0", + isp="Test ISP", + city="Test City", + region="Test Region", + country="Test Country", + ) + + +def test_process_completed_projects_inline_archiveresult(): + from archivebox.core.models import ArchiveResult + from archivebox.services.archive_result_service import ArchiveResultService + import asyncio + + snapshot = _create_snapshot() + plugin_dir = Path(snapshot.output_dir) / "wget" + plugin_dir.mkdir(parents=True, exist_ok=True) + (plugin_dir / "index.html").write_text("ok") + + bus = create_bus(name="test_inline_archiveresult") + service = ArchiveResultService(bus) + + event = ArchiveResultEvent( + snapshot_id=str(snapshot.id), + plugin="wget", + hook_name="on_Snapshot__06_wget.finite.bg", + status="succeeded", + output_str="wget/index.html", + output_files=[OutputFile(path="index.html", extension="html", mimetype="text/html", size=15)], + start_ts="2026-03-22T12:00:00+00:00", + end_ts="2026-03-22T12:00:01+00:00", + ) + + async def emit_event() -> None: + await service.on_ArchiveResultEvent__save_to_db(event) + + asyncio.run(emit_event()) + + result = ArchiveResult.objects.get(snapshot=snapshot, plugin="wget", hook_name="on_Snapshot__06_wget.finite.bg") + assert result.status == ArchiveResult.StatusChoices.SUCCEEDED + assert result.output_str == "wget/index.html" + assert "index.html" in result.output_files + assert result.output_files["index.html"] == {"extension": "html", "mimetype": "text/html", "size": 15} + assert result.output_size == 15 + _cleanup_machine_process_rows() + + +def test_process_completed_projects_synthetic_failed_archiveresult(): + from archivebox.core.models import ArchiveResult + from archivebox.services.archive_result_service import ArchiveResultService + import asyncio + + snapshot = _create_snapshot() + plugin_dir = Path(snapshot.output_dir) / "chrome" + plugin_dir.mkdir(parents=True, exist_ok=True) + + bus = 
create_bus(name="test_synthetic_archiveresult") + service = ArchiveResultService(bus) + + event = ArchiveResultEvent( + snapshot_id=str(snapshot.id), + plugin="chrome", + hook_name="on_Snapshot__11_chrome_wait", + status="failed", + output_str="Hook timed out after 60 seconds", + error="Hook timed out after 60 seconds", + start_ts="2026-03-22T12:00:00+00:00", + end_ts="2026-03-22T12:01:00+00:00", + ) + + async def emit_event() -> None: + await service.on_ArchiveResultEvent__save_to_db(event) + + asyncio.run(emit_event()) + + result = ArchiveResult.objects.get(snapshot=snapshot, plugin="chrome", hook_name="on_Snapshot__11_chrome_wait") + assert result.status == ArchiveResult.StatusChoices.FAILED + assert result.output_str == "Hook timed out after 60 seconds" + assert "Hook timed out" in result.notes + _cleanup_machine_process_rows() + + +def test_process_completed_projects_noresults_archiveresult(): + from archivebox.core.models import ArchiveResult + from archivebox.services.archive_result_service import ArchiveResultService + import asyncio + + snapshot = _create_snapshot() + plugin_dir = Path(snapshot.output_dir) / "title" + plugin_dir.mkdir(parents=True, exist_ok=True) + + bus = create_bus(name="test_noresults_archiveresult") + service = ArchiveResultService(bus) + + event = ArchiveResultEvent( + snapshot_id=str(snapshot.id), + plugin="title", + hook_name="on_Snapshot__54_title.js", + status="noresults", + output_str="No title found", + start_ts="2026-03-22T12:00:00+00:00", + end_ts="2026-03-22T12:00:01+00:00", + ) + + async def emit_event() -> None: + await service.on_ArchiveResultEvent__save_to_db(event) + + asyncio.run(emit_event()) + + result = ArchiveResult.objects.get(snapshot=snapshot, plugin="title", hook_name="on_Snapshot__54_title.js") + assert result.status == ArchiveResult.StatusChoices.NORESULTS + assert result.output_str == "No title found" + + +def test_retry_failed_archiveresults_requeues_snapshot_in_queued_state(): + from archivebox.core.models 
import ArchiveResult, Snapshot + + snapshot = _create_snapshot() + ArchiveResult.objects.create( + snapshot=snapshot, + plugin="chrome", + hook_name="on_Snapshot__11_chrome_wait", + status=ArchiveResult.StatusChoices.FAILED, + output_str="timed out", + output_files={"stderr.log": {}}, + output_size=123, + output_mimetypes="text/plain", + ) + + reset_count = snapshot.retry_failed_archiveresults() + + snapshot.refresh_from_db() + result = ArchiveResult.objects.get(snapshot=snapshot, plugin="chrome", hook_name="on_Snapshot__11_chrome_wait") + assert reset_count == 1 + assert snapshot.status == Snapshot.StatusChoices.QUEUED + assert snapshot.retry_at is not None + assert snapshot.current_step == 0 + assert result.status == ArchiveResult.StatusChoices.QUEUED + assert result.output_str == "" + assert result.output_json is None + assert result.output_files == {} + assert result.output_size == 0 + assert result.output_mimetypes == "" + assert result.start_ts is None + assert result.end_ts is None + snapshot.refresh_from_db() + assert snapshot.title in (None, "") + _cleanup_machine_process_rows() + + +def test_process_completed_projects_snapshot_title_from_output_str(): + from archivebox.services.archive_result_service import ArchiveResultService + import asyncio + + snapshot = _create_snapshot() + plugin_dir = Path(snapshot.output_dir) / "title" + plugin_dir.mkdir(parents=True, exist_ok=True) + + bus = create_bus(name="test_snapshot_title_output_str") + service = ArchiveResultService(bus) + + event = ArchiveResultEvent( + snapshot_id=str(snapshot.id), + plugin="title", + hook_name="on_Snapshot__54_title.js", + status="succeeded", + output_str="Example Domain", + start_ts="2026-03-22T12:00:00+00:00", + end_ts="2026-03-22T12:00:01+00:00", + ) + + async def emit_event() -> None: + await service.on_ArchiveResultEvent__save_to_db(event) + + asyncio.run(emit_event()) + + snapshot.refresh_from_db() + assert snapshot.title == "Example Domain" + _cleanup_machine_process_rows() + + 
+def test_process_completed_projects_snapshot_title_from_title_file(): + from archivebox.services.archive_result_service import ArchiveResultService + import asyncio + + snapshot = _create_snapshot() + plugin_dir = Path(snapshot.output_dir) / "title" + plugin_dir.mkdir(parents=True, exist_ok=True) + (plugin_dir / "title.txt").write_text("Example Domain") + + bus = create_bus(name="test_snapshot_title_file") + service = ArchiveResultService(bus) + + event = ArchiveResultEvent( + snapshot_id=str(snapshot.id), + plugin="title", + hook_name="on_Snapshot__54_title.js", + status="noresults", + output_str="No title found", + output_files=[OutputFile(path="title.txt", extension="txt", mimetype="text/plain", size=14)], + start_ts="2026-03-22T12:00:00+00:00", + end_ts="2026-03-22T12:00:01+00:00", + ) + + async def emit_event() -> None: + await service.on_ArchiveResultEvent__save_to_db(event) + + asyncio.run(emit_event()) + + snapshot.refresh_from_db() + assert snapshot.title == "Example Domain" + _cleanup_machine_process_rows() + + +def test_snapshot_resolved_title_falls_back_to_title_file_without_db_title(): + from archivebox.core.models import ArchiveResult + + snapshot = _create_snapshot() + plugin_dir = Path(snapshot.output_dir) / "title" + plugin_dir.mkdir(parents=True, exist_ok=True) + (plugin_dir / "title.txt").write_text("Example Domain") + ArchiveResult.objects.create( + snapshot=snapshot, + plugin="title", + hook_name="on_Snapshot__54_title.js", + status="noresults", + output_str="No title found", + output_files={"title.txt": {}}, + ) + + snapshot.refresh_from_db() + assert snapshot.title in (None, "") + assert snapshot.resolved_title == "Example Domain" + _cleanup_machine_process_rows() + + +def test_collect_output_metadata_preserves_file_metadata(): + from archivebox.services.archive_result_service import _resolve_output_metadata + + output_files, output_size, output_mimetypes = _resolve_output_metadata( + [OutputFile(path="index.html", extension="html", 
mimetype="text/html", size=42)], + Path("/tmp/does-not-need-to-exist"), + ) + + assert output_files == { + "index.html": { + "extension": "html", + "mimetype": "text/html", + "size": 42, + }, + } + assert output_size == 42 + assert output_mimetypes == "text/html" + + +def test_collect_output_metadata_detects_warc_gz_mimetype(tmp_path): + from archivebox.services.archive_result_service import _collect_output_metadata + + plugin_dir = tmp_path / "wget" + warc_file = plugin_dir / "warc" / "capture.warc.gz" + warc_file.parent.mkdir(parents=True, exist_ok=True) + warc_file.write_bytes(b"warc-bytes") + + output_files, output_size, output_mimetypes = _collect_output_metadata(plugin_dir) + + assert output_files["warc/capture.warc.gz"] == { + "extension": "gz", + "mimetype": "application/warc", + "size": 10, + } + assert output_size == 10 + assert output_mimetypes == "application/warc" + + +@pytest.mark.django_db(transaction=True) +def test_process_started_hydrates_binary_and_iface_from_existing_binary_records(monkeypatch, tmp_path): + from archivebox.machine.models import Binary, NetworkInterface + from archivebox.machine.models import Process as MachineProcess + from archivebox.services.process_service import ProcessService as ArchiveBoxProcessService + from abx_dl.services.process_service import ProcessService as DlProcessService + + machine = _create_machine() + iface = _create_iface(machine) + monkeypatch.setattr(NetworkInterface, "current", classmethod(lambda cls, refresh=False: iface)) + + binary = Binary.objects.create( + machine=machine, + name="postlight-parser", + abspath="/tmp/postlight-parser", + version="2.2.3", + binprovider="npm", + binproviders="npm", + status=Binary.StatusChoices.INSTALLED, + ) + + hook_path = tmp_path / "on_Snapshot__57_mercury.py" + hook_path.write_text("#!/bin/bash\nexit 0\n", encoding="utf-8") + hook_path.chmod(0o755) + output_dir = tmp_path / "mercury" + output_dir.mkdir() + + bus = 
create_bus(name="test_process_started_binary_hydration") + DlProcessService(bus, emit_jsonl=False, interactive_tty=False) + ArchiveBoxProcessService(bus) + + async def run_test() -> None: + await bus.emit( + ProcessEvent( + plugin_name="mercury", + hook_name="on_Snapshot__57_mercury.py", + hook_path=str(hook_path), + hook_args=["--url=https://example.com"], + is_background=False, + output_dir=str(output_dir), + env={ + "MERCURY_BINARY": binary.abspath, + "NODE_BINARY": "/tmp/node", + }, + timeout=60, + url="https://example.com", + ), + ) + started = await bus.find( + ProcessStartedEvent, + past=True, + future=False, + hook_name="on_Snapshot__57_mercury.py", + output_dir=str(output_dir), + ) + assert started is not None + + import asyncio + + asyncio.run(run_test()) + + process = MachineProcess.objects.get( + pwd=str(output_dir), + cmd=[str(hook_path), "--url=https://example.com"], + ) + assert process.binary_id == binary.id + assert process.iface_id == iface.id + + +@pytest.mark.django_db(transaction=True) +def test_process_started_uses_node_binary_for_js_hooks_without_plugin_binary(monkeypatch, tmp_path): + from archivebox.machine.models import Binary, NetworkInterface + from archivebox.machine.models import Process as MachineProcess + from archivebox.services.process_service import ProcessService as ArchiveBoxProcessService + from abx_dl.services.process_service import ProcessService as DlProcessService + + machine = _create_machine() + iface = _create_iface(machine) + monkeypatch.setattr(NetworkInterface, "current", classmethod(lambda cls, refresh=False: iface)) + + node = Binary.objects.create( + machine=machine, + name="node", + abspath="/tmp/node", + version="22.0.0", + binprovider="env", + binproviders="env", + status=Binary.StatusChoices.INSTALLED, + ) + + hook_path = tmp_path / "on_Snapshot__75_parse_dom_outlinks.js" + hook_path.write_text("#!/bin/bash\nexit 0\n", encoding="utf-8") + hook_path.chmod(0o755) + output_dir = tmp_path / "parse-dom-outlinks" + 
output_dir.mkdir() + + bus = create_bus(name="test_process_started_node_fallback") + DlProcessService(bus, emit_jsonl=False, interactive_tty=False) + ArchiveBoxProcessService(bus) + + async def run_test() -> None: + await bus.emit( + ProcessEvent( + plugin_name="parse_dom_outlinks", + hook_name="on_Snapshot__75_parse_dom_outlinks.js", + hook_path=str(hook_path), + hook_args=["--url=https://example.com"], + is_background=False, + output_dir=str(output_dir), + env={"NODE_BINARY": node.abspath}, + timeout=60, + url="https://example.com", + ), + ) + started = await bus.find( + ProcessStartedEvent, + past=True, + future=False, + hook_name="on_Snapshot__75_parse_dom_outlinks.js", + output_dir=str(output_dir), + ) + assert started is not None + + import asyncio + + asyncio.run(run_test()) + + process = MachineProcess.objects.get( + pwd=str(output_dir), + cmd=[str(hook_path), "--url=https://example.com"], + ) + assert process.binary_id == node.id + assert process.iface_id == iface.id + + +def test_binary_event_reuses_existing_installed_binary_row(monkeypatch): + from archivebox.machine.models import Binary, Machine + from archivebox.services.binary_service import BinaryService as ArchiveBoxBinaryService + import asyncio + + machine = _create_machine() + monkeypatch.setattr(Machine, "current", classmethod(lambda cls: machine)) + + binary = Binary.objects.create( + machine=machine, + name="wget", + abspath="/bin/sh", + version="9.9.9", + binprovider="env", + binproviders="env,apt,brew", + status=Binary.StatusChoices.INSTALLED, + ) + + service = ArchiveBoxBinaryService(create_bus(name="test_binary_event_reuses_existing_installed_binary_row")) + event = BinaryRequestEvent( + name="wget", + plugin_name="wget", + output_dir="/tmp/wget", + binproviders="provider", + ) + + asyncio.run(service.on_BinaryRequestEvent(event)) + + binary.refresh_from_db() + assert Binary.objects.filter(machine=machine, name="wget").count() == 1 + assert binary.status == Binary.StatusChoices.INSTALLED + 
assert binary.abspath == "/bin/sh" + assert binary.version == "9.9.9" + assert binary.binprovider == "env" + assert binary.binproviders == "provider" diff --git a/archivebox/tests/test_auth_ldap.py b/archivebox/tests/test_auth_ldap.py new file mode 100644 index 0000000000..2364b16be1 --- /dev/null +++ b/archivebox/tests/test_auth_ldap.py @@ -0,0 +1,218 @@ +""" +LDAP authentication tests for ArchiveBox. + +Tests LDAP configuration, validation, and integration with Django. +Per CLAUDE.md: NO MOCKS, NO SKIPS - all tests use real code paths. +""" + +import os +import sys +import tempfile +import unittest +from importlib.util import find_spec + + +class TestLDAPConfig(unittest.TestCase): + """Test LDAP configuration loading and validation.""" + + def test_ldap_config_defaults(self): + """Test that LDAP config loads with correct defaults.""" + from archivebox.config.ldap import LDAP_CONFIG + + # Check default values + self.assertFalse(LDAP_CONFIG.LDAP_ENABLED) + self.assertIsNone(LDAP_CONFIG.LDAP_SERVER_URI) + self.assertIsNone(LDAP_CONFIG.LDAP_BIND_DN) + self.assertIsNone(LDAP_CONFIG.LDAP_BIND_PASSWORD) + self.assertIsNone(LDAP_CONFIG.LDAP_USER_BASE) + self.assertEqual(LDAP_CONFIG.LDAP_USER_FILTER, "(uid=%(user)s)") + self.assertEqual(LDAP_CONFIG.LDAP_USERNAME_ATTR, "username") + self.assertEqual(LDAP_CONFIG.LDAP_FIRSTNAME_ATTR, "givenName") + self.assertEqual(LDAP_CONFIG.LDAP_LASTNAME_ATTR, "sn") + self.assertEqual(LDAP_CONFIG.LDAP_EMAIL_ATTR, "mail") + self.assertFalse(LDAP_CONFIG.LDAP_CREATE_SUPERUSER) + + def test_ldap_config_validation_disabled(self): + """Test that validation passes when LDAP is disabled.""" + from archivebox.config.ldap import LDAPConfig + + config = LDAPConfig(LDAP_ENABLED=False) + is_valid, error_msg = config.validate_ldap_config() + + self.assertTrue(is_valid) + self.assertEqual(error_msg, "") + + def test_ldap_config_validation_missing_fields(self): + """Test that validation fails when required fields are missing.""" + from 
archivebox.config.ldap import LDAPConfig + + # Enable LDAP but don't provide required fields + config = LDAPConfig(LDAP_ENABLED=True) + is_valid, error_msg = config.validate_ldap_config() + + self.assertFalse(is_valid) + self.assertIn("LDAP_* config options must all be set", error_msg) + self.assertIn("LDAP_SERVER_URI", error_msg) + self.assertIn("LDAP_BIND_DN", error_msg) + self.assertIn("LDAP_BIND_PASSWORD", error_msg) + self.assertIn("LDAP_USER_BASE", error_msg) + + def test_ldap_config_validation_complete(self): + """Test that validation passes when all required fields are provided.""" + from archivebox.config.ldap import LDAPConfig + + config = LDAPConfig( + LDAP_ENABLED=True, + LDAP_SERVER_URI="ldap://ldap-test.localhost:389", + LDAP_BIND_DN="cn=admin,dc=example,dc=com", + LDAP_BIND_PASSWORD="password", + LDAP_USER_BASE="ou=users,dc=example,dc=com", + ) + is_valid, error_msg = config.validate_ldap_config() + + self.assertTrue(is_valid) + self.assertEqual(error_msg, "") + + def test_ldap_config_in_get_config(self): + """Test that LDAP_CONFIG is included in get_CONFIG().""" + from archivebox.config import get_CONFIG + + all_config = get_CONFIG() + self.assertIn("LDAP_CONFIG", all_config) + self.assertEqual(all_config["LDAP_CONFIG"].__class__.__name__, "LDAPConfig") + + +class TestLDAPIntegration(unittest.TestCase): + """Test LDAP integration with Django settings.""" + + def test_django_settings_without_ldap_enabled(self): + """Test that Django settings work correctly when LDAP is disabled.""" + # Import Django settings (LDAP_ENABLED should be False by default) + from django.conf import settings + + # Should have default authentication backends + self.assertIn("django.contrib.auth.backends.RemoteUserBackend", settings.AUTHENTICATION_BACKENDS) + self.assertIn("django.contrib.auth.backends.ModelBackend", settings.AUTHENTICATION_BACKENDS) + + # LDAP backend should not be present when disabled + ldap_backends = [b for b in settings.AUTHENTICATION_BACKENDS if "ldap" 
in b.lower()] + self.assertEqual(len(ldap_backends), 0, "LDAP backend should not be present when LDAP_ENABLED=False") + + def test_django_settings_with_ldap_library_check(self): + """Test that Django settings check for LDAP libraries when enabled.""" + ldap_available = find_spec("django_auth_ldap") is not None and find_spec("ldap") is not None + + # If LDAP libraries are not available, settings should handle gracefully + if not ldap_available: + # Settings should have loaded without LDAP backend + from django.conf import settings + + ldap_backends = [b for b in settings.AUTHENTICATION_BACKENDS if "ldap" in b.lower()] + self.assertEqual(len(ldap_backends), 0, "LDAP backend should not be present when libraries unavailable") + + +class TestLDAPAuthBackend(unittest.TestCase): + """Test custom LDAP authentication backend.""" + + def test_ldap_backend_class_exists(self): + """Test that ArchiveBoxLDAPBackend class is defined.""" + from archivebox.ldap.auth import ArchiveBoxLDAPBackend + + self.assertTrue(hasattr(ArchiveBoxLDAPBackend, "authenticate_ldap_user")) + + def test_ldap_backend_inherits_correctly(self): + """Test that ArchiveBoxLDAPBackend has correct inheritance.""" + from archivebox.ldap.auth import ArchiveBoxLDAPBackend + + # Should have authenticate_ldap_user method (from base or overridden) + self.assertTrue(callable(getattr(ArchiveBoxLDAPBackend, "authenticate_ldap_user", None))) + + +class TestArchiveBoxWithLDAP(unittest.TestCase): + """Test ArchiveBox commands with LDAP configuration.""" + + def setUp(self): + """Set up test environment.""" + self.work_dir = tempfile.mkdtemp(prefix="archivebox-ldap-test-") + + def test_archivebox_init_without_ldap(self): + """Test that archivebox init works without LDAP enabled.""" + import subprocess + + # Run archivebox init + result = subprocess.run( + [sys.executable, "-m", "archivebox", "init"], + cwd=self.work_dir, + capture_output=True, + timeout=45, + env={ + **os.environ, + "DATA_DIR": self.work_dir, + 
"LDAP_ENABLED": "False", + }, + ) + + # Should succeed + self.assertEqual(result.returncode, 0, f"archivebox init failed: {result.stderr.decode()}") + + def test_archivebox_version_with_ldap_config(self): + """Test that archivebox version works with LDAP config set.""" + import subprocess + + # Run archivebox version with LDAP config env vars + result = subprocess.run( + [sys.executable, "-m", "archivebox", "version"], + cwd=self.work_dir, + capture_output=True, + timeout=10, + env={ + **os.environ, + "DATA_DIR": self.work_dir, + "LDAP_ENABLED": "False", + "LDAP_SERVER_URI": "ldap://ldap-test.localhost:389", + }, + ) + + # Should succeed + self.assertEqual(result.returncode, 0, f"archivebox version failed: {result.stderr.decode()}") + + +class TestLDAPConfigValidationInArchiveBox(unittest.TestCase): + """Test LDAP config validation when running ArchiveBox commands.""" + + def setUp(self): + """Set up test environment.""" + self.work_dir = tempfile.mkdtemp(prefix="archivebox-ldap-validation-") + + def test_archivebox_init_with_incomplete_ldap_config(self): + """Test that archivebox init fails with helpful error when LDAP config is incomplete.""" + import subprocess + + # Run archivebox init with LDAP enabled but missing required fields + result = subprocess.run( + [sys.executable, "-m", "archivebox", "init"], + cwd=self.work_dir, + capture_output=True, + timeout=45, + env={ + **os.environ, + "DATA_DIR": self.work_dir, + "LDAP_ENABLED": "True", + # Missing: LDAP_SERVER_URI, LDAP_BIND_DN, etc. 
+ }, + ) + + # Should fail with validation error + self.assertNotEqual(result.returncode, 0, "Should fail with incomplete LDAP config") + + # Check error message + stderr = result.stderr.decode() + self.assertIn( + "LDAP_* config options must all be set", + stderr, + f"Expected validation error message in: {stderr}", + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/archivebox/tests/test_cli_add.py b/archivebox/tests/test_cli_add.py new file mode 100644 index 0000000000..95052b484d --- /dev/null +++ b/archivebox/tests/test_cli_add.py @@ -0,0 +1,461 @@ +#!/usr/bin/env python3 +""" +Comprehensive tests for archivebox add command. +Verify add creates snapshots in DB, crawls, source files, and archive directories. +""" + +import os +import sqlite3 +import subprocess +from pathlib import Path + + +def _find_snapshot_dir(data_dir: Path, snapshot_id: str) -> Path | None: + candidates = {snapshot_id} + if len(snapshot_id) == 32: + candidates.add(f"{snapshot_id[:8]}-{snapshot_id[8:12]}-{snapshot_id[12:16]}-{snapshot_id[16:20]}-{snapshot_id[20:]}") + elif len(snapshot_id) == 36 and "-" in snapshot_id: + candidates.add(snapshot_id.replace("-", "")) + + for needle in candidates: + for path in data_dir.rglob(needle): + if path.is_dir(): + return path + return None + + +def test_add_single_url_creates_snapshot_in_db(tmp_path, process, disable_extractors_dict): + """Test that adding a single URL creates a snapshot in the database.""" + os.chdir(tmp_path) + result = subprocess.run( + ["archivebox", "add", "--index-only", "--depth=0", "https://example.com"], + capture_output=True, + env=disable_extractors_dict, + ) + + assert result.returncode == 0 + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + snapshots = c.execute("SELECT url FROM core_snapshot").fetchall() + conn.close() + + assert len(snapshots) == 1 + assert snapshots[0][0] == "https://example.com" + + +def test_add_bg_creates_root_snapshot_rows_immediately(tmp_path, process, 
disable_extractors_dict): + """Background add should create root snapshots immediately so the queue is visible in the DB.""" + os.chdir(tmp_path) + result = subprocess.run( + ["archivebox", "add", "--bg", "--depth=0", "https://example.com"], + capture_output=True, + env=disable_extractors_dict, + ) + + assert result.returncode == 0 + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + snapshots = c.execute("SELECT url, status FROM core_snapshot").fetchall() + conn.close() + + assert len(snapshots) == 1 + assert snapshots[0][0] == "https://example.com" + assert snapshots[0][1] == "queued" + + +def test_add_creates_crawl_record(tmp_path, process, disable_extractors_dict): + """Test that add command creates a Crawl record in the database.""" + os.chdir(tmp_path) + subprocess.run( + ["archivebox", "add", "--index-only", "--depth=0", "https://example.com"], + capture_output=True, + env=disable_extractors_dict, + ) + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + crawl_count = c.execute("SELECT COUNT(*) FROM crawls_crawl").fetchone()[0] + conn.close() + + assert crawl_count == 1 + + +def test_add_creates_source_file(tmp_path, process, disable_extractors_dict): + """Test that add creates a source file with the URL.""" + os.chdir(tmp_path) + subprocess.run( + ["archivebox", "add", "--index-only", "--depth=0", "https://example.com"], + capture_output=True, + env=disable_extractors_dict, + ) + + sources_dir = tmp_path / "sources" + assert sources_dir.exists() + + source_files = list(sources_dir.glob("*cli_add.txt")) + assert len(source_files) >= 1 + + source_content = source_files[0].read_text() + assert "https://example.com" in source_content + + +def test_add_multiple_urls_single_command(tmp_path, process, disable_extractors_dict): + """Test adding multiple URLs in a single command.""" + os.chdir(tmp_path) + result = subprocess.run( + ["archivebox", "add", "--index-only", "--depth=0", "https://example.com", "https://example.org"], + 
capture_output=True, + env=disable_extractors_dict, + ) + + assert result.returncode == 0 + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + snapshot_count = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0] + urls = c.execute("SELECT url FROM core_snapshot ORDER BY url").fetchall() + conn.close() + + assert snapshot_count == 2 + assert urls[0][0] == "https://example.com" + assert urls[1][0] == "https://example.org" + + +def test_add_from_file(tmp_path, process, disable_extractors_dict): + """Test adding URLs from a file. + + The add command should treat a file argument as URL input and create snapshots + for each URL it contains. + """ + os.chdir(tmp_path) + + # Create a file with URLs + urls_file = tmp_path / "urls.txt" + urls_file.write_text("https://example.com\nhttps://example.org\n") + + result = subprocess.run( + ["archivebox", "add", "--index-only", "--depth=0", str(urls_file)], + capture_output=True, + env=disable_extractors_dict, + ) + + assert result.returncode == 0 + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + crawl_count = c.execute("SELECT COUNT(*) FROM crawls_crawl").fetchone()[0] + snapshot_count = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0] + conn.close() + + # The file is parsed into two input URLs. 
+ assert crawl_count == 1 + assert snapshot_count == 2 + + +def test_add_with_depth_0_flag(tmp_path, process, disable_extractors_dict): + """Test that --depth=0 flag is accepted and works.""" + os.chdir(tmp_path) + result = subprocess.run( + ["archivebox", "add", "--index-only", "--depth=0", "https://example.com"], + capture_output=True, + env=disable_extractors_dict, + ) + + assert result.returncode == 0 + assert "unrecognized arguments: --depth" not in result.stderr.decode("utf-8") + + +def test_add_with_depth_1_flag(tmp_path, process, disable_extractors_dict): + """Test that --depth=1 flag is accepted.""" + os.chdir(tmp_path) + result = subprocess.run( + ["archivebox", "add", "--index-only", "--depth=1", "https://example.com"], + capture_output=True, + env=disable_extractors_dict, + ) + + assert result.returncode == 0 + assert "unrecognized arguments: --depth" not in result.stderr.decode("utf-8") + + +def test_add_rejects_invalid_depth_values(tmp_path, process, disable_extractors_dict): + """Test that add rejects depth values outside the supported range.""" + os.chdir(tmp_path) + + for depth in ("5", "-1"): + result = subprocess.run( + ["archivebox", "add", "--index-only", f"--depth={depth}", "https://example.com"], + capture_output=True, + env=disable_extractors_dict, + ) + stderr = result.stderr.decode("utf-8").lower() + assert result.returncode != 0 + assert "invalid" in stderr or "not one of" in stderr + + +def test_add_with_tags(tmp_path, process, disable_extractors_dict): + """Test adding URL with tags stores tags_str in crawl. + + With --index-only, Tag objects are not created until archiving happens. + Tags are stored as a string in the Crawl.tags_str field. 
+ """ + os.chdir(tmp_path) + subprocess.run( + ["archivebox", "add", "--index-only", "--depth=0", "--tag=test,example", "https://example.com"], + capture_output=True, + env=disable_extractors_dict, + ) + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + tags_str = c.execute("SELECT tags_str FROM crawls_crawl").fetchone()[0] + conn.close() + + # Tags are stored as a comma-separated string in crawl + assert "test" in tags_str or "example" in tags_str + + +def test_add_records_selected_persona_on_crawl(tmp_path, process, disable_extractors_dict): + """Test add persists the selected persona so browser config derives from it later.""" + os.chdir(tmp_path) + result = subprocess.run( + ["archivebox", "add", "--index-only", "--depth=0", "--persona=Default", "https://example.com"], + capture_output=True, + env=disable_extractors_dict, + ) + + assert result.returncode == 0 + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + persona_id, default_persona = c.execute( + "SELECT persona_id, json_extract(config, '$.DEFAULT_PERSONA') FROM crawls_crawl LIMIT 1", + ).fetchone() + conn.close() + + assert persona_id + assert default_persona == "Default" + assert (tmp_path / "personas" / "Default" / "chrome_user_data").is_dir() + + +def test_add_records_url_filter_overrides_on_crawl(tmp_path, process, disable_extractors_dict): + os.chdir(tmp_path) + result = subprocess.run( + [ + "archivebox", + "add", + "--index-only", + "--depth=0", + "--domain-allowlist=example.com,*.example.com", + "--domain-denylist=static.example.com", + "https://example.com", + ], + capture_output=True, + env=disable_extractors_dict, + ) + + assert result.returncode == 0 + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + allowlist, denylist = c.execute( + "SELECT json_extract(config, '$.URL_ALLOWLIST'), json_extract(config, '$.URL_DENYLIST') FROM crawls_crawl LIMIT 1", + ).fetchone() + conn.close() + + assert allowlist == "example.com,*.example.com" + assert denylist == 
"static.example.com" + assert (tmp_path / "personas" / "Default" / "chrome_extensions").is_dir() + + +def test_add_duplicate_url_creates_separate_crawls(tmp_path, process, disable_extractors_dict): + """Test that adding the same URL twice creates separate crawls and snapshots. + + Each 'add' command creates a new Crawl. Multiple crawls can archive the same URL. + This allows re-archiving URLs at different times. + """ + os.chdir(tmp_path) + + # Add URL first time + subprocess.run( + ["archivebox", "add", "--index-only", "--depth=0", "https://example.com"], + capture_output=True, + env=disable_extractors_dict, + ) + + # Add same URL second time + subprocess.run( + ["archivebox", "add", "--index-only", "--depth=0", "https://example.com"], + capture_output=True, + env=disable_extractors_dict, + ) + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + snapshot_count = c.execute("SELECT COUNT(*) FROM core_snapshot WHERE url='https://example.com'").fetchone()[0] + crawl_count = c.execute("SELECT COUNT(*) FROM crawls_crawl").fetchone()[0] + conn.close() + + # Each add creates a new crawl with its own snapshot + assert crawl_count == 2 + assert snapshot_count == 2 + + +def test_add_with_overwrite_flag(tmp_path, process, disable_extractors_dict): + """Test that --overwrite flag forces re-archiving.""" + os.chdir(tmp_path) + + # Add URL first time + subprocess.run( + ["archivebox", "add", "--index-only", "--depth=0", "https://example.com"], + capture_output=True, + env=disable_extractors_dict, + ) + + # Add with overwrite + result = subprocess.run( + ["archivebox", "add", "--index-only", "--overwrite", "https://example.com"], + capture_output=True, + env=disable_extractors_dict, + ) + + assert result.returncode == 0 + assert "unrecognized arguments: --overwrite" not in result.stderr.decode("utf-8") + + +def test_add_creates_snapshot_output_directory(tmp_path, process, disable_extractors_dict): + """Test that add creates the current snapshot output directory on 
disk.""" + os.chdir(tmp_path) + subprocess.run( + ["archivebox", "add", "--index-only", "--depth=0", "https://example.com"], + capture_output=True, + env=disable_extractors_dict, + ) + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + snapshot_id = str(c.execute("SELECT id FROM core_snapshot").fetchone()[0]) + conn.close() + + snapshot_dir = _find_snapshot_dir(tmp_path, snapshot_id) + assert snapshot_dir is not None, f"Snapshot output directory not found for {snapshot_id}" + assert snapshot_dir.is_dir() + + +def test_add_help_shows_depth_and_tag_options(tmp_path, process): + """Test that add --help documents the main filter and crawl options.""" + os.chdir(tmp_path) + + result = subprocess.run( + ["archivebox", "add", "--help"], + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + assert "--depth" in result.stdout + assert "--max-urls" in result.stdout + assert "--max-size" in result.stdout + assert "--tag" in result.stdout + + +def test_add_records_max_url_and_size_limits_on_crawl(tmp_path, process, disable_extractors_dict): + os.chdir(tmp_path) + result = subprocess.run( + ["archivebox", "add", "--index-only", "--depth=1", "--max-urls=3", "--max-size=45mb", "https://example.com"], + capture_output=True, + env=disable_extractors_dict, + ) + + assert result.returncode == 0 + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + max_urls, max_size, config_max_urls, config_max_size = c.execute( + "SELECT max_urls, max_size, json_extract(config, '$.MAX_URLS'), json_extract(config, '$.MAX_SIZE') FROM crawls_crawl LIMIT 1", + ).fetchone() + conn.close() + + assert max_urls == 3 + assert max_size == 45 * 1024 * 1024 + assert config_max_urls == 3 + assert config_max_size == 45 * 1024 * 1024 + + +def test_add_without_args_shows_usage(tmp_path, process): + """Test that add without URLs fails with a usage hint instead of crashing.""" + os.chdir(tmp_path) + + result = subprocess.run( + ["archivebox", "add"], + 
capture_output=True, + text=True, + ) + + combined = result.stdout + result.stderr + assert result.returncode != 0 + assert "usage" in combined.lower() or "url" in combined.lower() + + +def test_add_index_only_skips_extraction(tmp_path, process, disable_extractors_dict): + """Test that --index-only flag skips extraction (fast).""" + os.chdir(tmp_path) + result = subprocess.run( + ["archivebox", "add", "--index-only", "--depth=0", "https://example.com"], + capture_output=True, + env=disable_extractors_dict, + timeout=30, # Should be fast + ) + + assert result.returncode == 0 + + # Snapshot should exist but archive results should be minimal + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + snapshot_count = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0] + conn.close() + + assert snapshot_count == 1 + + +def test_add_links_snapshot_to_crawl(tmp_path, process, disable_extractors_dict): + """Test that add links the snapshot to the crawl via crawl_id.""" + os.chdir(tmp_path) + subprocess.run( + ["archivebox", "add", "--index-only", "--depth=0", "https://example.com"], + capture_output=True, + env=disable_extractors_dict, + ) + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + + # Get crawl id + crawl_id = c.execute("SELECT id FROM crawls_crawl").fetchone()[0] + + # Get snapshot's crawl_id + snapshot_crawl = c.execute("SELECT crawl_id FROM core_snapshot").fetchone()[0] + + conn.close() + + assert snapshot_crawl == crawl_id + + +def test_add_sets_snapshot_timestamp(tmp_path, process, disable_extractors_dict): + """Test that add sets a timestamp on the snapshot.""" + os.chdir(tmp_path) + subprocess.run( + ["archivebox", "add", "--index-only", "--depth=0", "https://example.com"], + capture_output=True, + env=disable_extractors_dict, + ) + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + timestamp = c.execute("SELECT timestamp FROM core_snapshot").fetchone()[0] + conn.close() + + assert timestamp is not None + assert 
len(str(timestamp)) > 0 diff --git a/archivebox/tests/test_cli_archiveresult.py b/archivebox/tests/test_cli_archiveresult.py new file mode 100644 index 0000000000..3a4174b61a --- /dev/null +++ b/archivebox/tests/test_cli_archiveresult.py @@ -0,0 +1,336 @@ +""" +Tests for archivebox archiveresult CLI command. + +Tests cover: +- archiveresult create (from Snapshot JSONL, with --plugin, pass-through) +- archiveresult list (with filters) +- archiveresult update +- archiveresult delete +""" + +import json + +from archivebox.tests.conftest import ( + run_archivebox_cmd, + parse_jsonl_output, + create_test_url, +) + +PROJECTOR_TEST_ENV = { + "PLUGINS": "favicon", + "SAVE_FAVICON": "True", + "USE_COLOR": "False", + "SHOW_PROGRESS": "False", +} + + +class TestArchiveResultCreate: + """Tests for `archivebox archiveresult create`.""" + + def test_create_from_snapshot_jsonl(self, initialized_archive): + """Create archive results from Snapshot JSONL input.""" + url = create_test_url() + + # Create a snapshot first + stdout1, _, _ = run_archivebox_cmd(["snapshot", "create", url], data_dir=initialized_archive) + snapshot = parse_jsonl_output(stdout1)[0] + + # Pipe snapshot to archiveresult create + stdout2, stderr, code = run_archivebox_cmd( + ["archiveresult", "create", "--plugin=title"], + stdin=json.dumps(snapshot), + data_dir=initialized_archive, + ) + + assert code == 0, f"Command failed: {stderr}" + + records = parse_jsonl_output(stdout2) + # Should have the Snapshot passed through and an ArchiveResult request emitted + types = [r.get("type") for r in records] + assert "Snapshot" in types + assert "ArchiveResult" in types + + ar = next(r for r in records if r["type"] == "ArchiveResult") + assert ar["plugin"] == "title" + assert "id" not in ar + + def test_create_with_specific_plugin(self, initialized_archive): + """Create archive result for specific plugin.""" + url = create_test_url() + stdout1, _, _ = run_archivebox_cmd(["snapshot", "create", url], 
data_dir=initialized_archive) + snapshot = parse_jsonl_output(stdout1)[0] + + stdout2, stderr, code = run_archivebox_cmd( + ["archiveresult", "create", "--plugin=screenshot"], + stdin=json.dumps(snapshot), + data_dir=initialized_archive, + ) + + assert code == 0 + records = parse_jsonl_output(stdout2) + ar_records = [r for r in records if r.get("type") == "ArchiveResult"] + assert len(ar_records) >= 1 + assert ar_records[0]["plugin"] == "screenshot" + + def test_create_pass_through_crawl(self, initialized_archive): + """Pass-through Crawl records unchanged.""" + url = create_test_url() + + # Create crawl and snapshot + stdout1, _, _ = run_archivebox_cmd(["crawl", "create", url], data_dir=initialized_archive) + crawl = parse_jsonl_output(stdout1)[0] + + stdout2, _, _ = run_archivebox_cmd( + ["snapshot", "create"], + stdin=json.dumps(crawl), + data_dir=initialized_archive, + ) + + # Now pipe all to archiveresult create + stdout3, stderr, code = run_archivebox_cmd( + ["archiveresult", "create", "--plugin=title"], + stdin=stdout2, + data_dir=initialized_archive, + ) + + assert code == 0 + records = parse_jsonl_output(stdout3) + + types = [r.get("type") for r in records] + assert "Crawl" in types + assert "Snapshot" in types + assert "ArchiveResult" in types + + def test_create_pass_through_only_when_no_snapshots(self, initialized_archive): + """Only pass-through records but no new snapshots returns success.""" + crawl_record = {"type": "Crawl", "id": "fake-id", "urls": "https://example.com"} + + stdout, stderr, code = run_archivebox_cmd( + ["archiveresult", "create"], + stdin=json.dumps(crawl_record), + data_dir=initialized_archive, + ) + + assert code == 0 + assert "Passed through" in stderr + + +class TestArchiveResultList: + """Tests for `archivebox archiveresult list`.""" + + def test_list_empty(self, initialized_archive): + """List with no archive results returns empty.""" + stdout, stderr, code = run_archivebox_cmd( + ["archiveresult", "list"], + 
data_dir=initialized_archive, + ) + + assert code == 0 + assert "Listed 0 archive results" in stderr + + def test_list_filter_by_status(self, initialized_archive): + """Filter archive results by status.""" + # Create snapshot and materialize an archive result via the runner + url = create_test_url() + stdout1, _, _ = run_archivebox_cmd(["snapshot", "create", url], data_dir=initialized_archive) + snapshot = parse_jsonl_output(stdout1)[0] + stdout2, _, _ = run_archivebox_cmd( + ["archiveresult", "create", "--plugin=favicon"], + stdin=json.dumps(snapshot), + data_dir=initialized_archive, + ) + run_archivebox_cmd( + ["run"], + stdin=stdout2, + data_dir=initialized_archive, + timeout=120, + env=PROJECTOR_TEST_ENV, + ) + created = parse_jsonl_output( + run_archivebox_cmd( + ["archiveresult", "list", "--plugin=favicon"], + data_dir=initialized_archive, + )[0], + )[0] + run_archivebox_cmd( + ["archiveresult", "update", "--status=queued"], + stdin=json.dumps(created), + data_dir=initialized_archive, + ) + + stdout, stderr, code = run_archivebox_cmd( + ["archiveresult", "list", "--status=queued"], + data_dir=initialized_archive, + ) + + assert code == 0 + records = parse_jsonl_output(stdout) + for r in records: + assert r["status"] == "queued" + + def test_list_filter_by_plugin(self, initialized_archive): + """Filter archive results by plugin.""" + url = create_test_url() + stdout1, _, _ = run_archivebox_cmd(["snapshot", "create", url], data_dir=initialized_archive) + snapshot = parse_jsonl_output(stdout1)[0] + stdout2, _, _ = run_archivebox_cmd( + ["archiveresult", "create", "--plugin=favicon"], + stdin=json.dumps(snapshot), + data_dir=initialized_archive, + ) + run_archivebox_cmd( + ["run"], + stdin=stdout2, + data_dir=initialized_archive, + timeout=120, + env=PROJECTOR_TEST_ENV, + ) + + stdout, stderr, code = run_archivebox_cmd( + ["archiveresult", "list", "--plugin=favicon"], + data_dir=initialized_archive, + ) + + assert code == 0 + records = parse_jsonl_output(stdout) 
+ for r in records: + assert r["plugin"] == "favicon" + + def test_list_with_limit(self, initialized_archive): + """Limit number of results.""" + # Create multiple archive results + for _ in range(3): + url = create_test_url() + stdout1, _, _ = run_archivebox_cmd(["snapshot", "create", url], data_dir=initialized_archive) + snapshot = parse_jsonl_output(stdout1)[0] + stdout2, _, _ = run_archivebox_cmd( + ["archiveresult", "create", "--plugin=favicon"], + stdin=json.dumps(snapshot), + data_dir=initialized_archive, + ) + run_archivebox_cmd( + ["run"], + stdin=stdout2, + data_dir=initialized_archive, + timeout=120, + env=PROJECTOR_TEST_ENV, + ) + + stdout, stderr, code = run_archivebox_cmd( + ["archiveresult", "list", "--limit=2"], + data_dir=initialized_archive, + ) + + assert code == 0 + records = parse_jsonl_output(stdout) + assert len(records) == 2 + + +class TestArchiveResultUpdate: + """Tests for `archivebox archiveresult update`.""" + + def test_update_status(self, initialized_archive): + """Update archive result status.""" + url = create_test_url() + stdout1, _, _ = run_archivebox_cmd(["snapshot", "create", url], data_dir=initialized_archive) + snapshot = parse_jsonl_output(stdout1)[0] + + stdout2, _, _ = run_archivebox_cmd( + ["archiveresult", "create", "--plugin=favicon"], + stdin=json.dumps(snapshot), + data_dir=initialized_archive, + ) + stdout_run, _, _ = run_archivebox_cmd( + ["run"], + stdin=stdout2, + data_dir=initialized_archive, + timeout=120, + env=PROJECTOR_TEST_ENV, + ) + stdout_list, _, _ = run_archivebox_cmd( + ["archiveresult", "list", "--plugin=favicon"], + data_dir=initialized_archive, + ) + ar = parse_jsonl_output(stdout_list)[0] + + stdout3, stderr, code = run_archivebox_cmd( + ["archiveresult", "update", "--status=failed"], + stdin=json.dumps(ar), + data_dir=initialized_archive, + ) + + assert code == 0 + assert "Updated 1 archive results" in stderr + + records = parse_jsonl_output(stdout3) + assert records[0]["status"] == "failed" + + 
+class TestArchiveResultDelete: + """Tests for `archivebox archiveresult delete`.""" + + def test_delete_requires_yes(self, initialized_archive): + """Delete requires --yes flag.""" + url = create_test_url() + stdout1, _, _ = run_archivebox_cmd(["snapshot", "create", url], data_dir=initialized_archive) + snapshot = parse_jsonl_output(stdout1)[0] + + stdout2, _, _ = run_archivebox_cmd( + ["archiveresult", "create", "--plugin=favicon"], + stdin=json.dumps(snapshot), + data_dir=initialized_archive, + ) + stdout_run, _, _ = run_archivebox_cmd( + ["run"], + stdin=stdout2, + data_dir=initialized_archive, + timeout=120, + env=PROJECTOR_TEST_ENV, + ) + stdout_list, _, _ = run_archivebox_cmd( + ["archiveresult", "list", "--plugin=favicon"], + data_dir=initialized_archive, + ) + ar = parse_jsonl_output(stdout_list)[0] + + stdout, stderr, code = run_archivebox_cmd( + ["archiveresult", "delete"], + stdin=json.dumps(ar), + data_dir=initialized_archive, + ) + + assert code == 1 + assert "--yes" in stderr + + def test_delete_with_yes(self, initialized_archive): + """Delete with --yes flag works.""" + url = create_test_url() + stdout1, _, _ = run_archivebox_cmd(["snapshot", "create", url], data_dir=initialized_archive) + snapshot = parse_jsonl_output(stdout1)[0] + + stdout2, _, _ = run_archivebox_cmd( + ["archiveresult", "create", "--plugin=favicon"], + stdin=json.dumps(snapshot), + data_dir=initialized_archive, + ) + stdout_run, _, _ = run_archivebox_cmd( + ["run"], + stdin=stdout2, + data_dir=initialized_archive, + timeout=120, + env=PROJECTOR_TEST_ENV, + ) + stdout_list, _, _ = run_archivebox_cmd( + ["archiveresult", "list", "--plugin=favicon"], + data_dir=initialized_archive, + ) + ar = parse_jsonl_output(stdout_list)[0] + + stdout, stderr, code = run_archivebox_cmd( + ["archiveresult", "delete", "--yes"], + stdin=json.dumps(ar), + data_dir=initialized_archive, + ) + + assert code == 0 + assert "Deleted 1 archive results" in stderr diff --git 
a/archivebox/tests/test_cli_config.py b/archivebox/tests/test_cli_config.py new file mode 100644 index 0000000000..5cb28a4879 --- /dev/null +++ b/archivebox/tests/test_cli_config.py @@ -0,0 +1,200 @@ +#!/usr/bin/env python3 +""" +Comprehensive tests for archivebox config command. +Verify config reads/writes ArchiveBox.conf file correctly. +""" + +import os +import subprocess + + +def test_config_displays_all_config(tmp_path, process): + """Test that config without args displays all configuration.""" + os.chdir(tmp_path) + result = subprocess.run(["archivebox", "config"], capture_output=True, text=True) + + assert result.returncode == 0 + output = result.stdout + # Should show config sections + assert len(output) > 100 + # Should show at least some standard config keys + assert "TIMEOUT" in output or "OUTPUT_PERMISSIONS" in output + + +def test_config_get_specific_key(tmp_path, process): + """Test that config --get KEY retrieves specific value.""" + os.chdir(tmp_path) + result = subprocess.run( + ["archivebox", "config", "--get", "TIMEOUT"], + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + assert "TIMEOUT" in result.stdout + + +def test_config_set_writes_to_file(tmp_path, process): + """Test that config --set KEY=VALUE writes to ArchiveBox.conf.""" + os.chdir(tmp_path) + + result = subprocess.run( + ["archivebox", "config", "--set", "TIMEOUT=120"], + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + + # Verify config file was updated + config_file = tmp_path / "ArchiveBox.conf" + assert config_file.exists() + + content = config_file.read_text() + assert "TIMEOUT" in content or "120" in content + + +def test_config_set_and_get_roundtrip(tmp_path, process): + """Test that set value can be retrieved with get.""" + os.chdir(tmp_path) + + # Set a unique value + subprocess.run( + ["archivebox", "config", "--set", "TIMEOUT=987"], + capture_output=True, + text=True, + ) + + # Get the value back + result = subprocess.run( + 
["archivebox", "config", "--get", "TIMEOUT"], + capture_output=True, + text=True, + ) + + assert "987" in result.stdout + + +def test_config_set_multiple_values(tmp_path, process): + """Test setting multiple config values at once.""" + os.chdir(tmp_path) + + result = subprocess.run( + ["archivebox", "config", "--set", "TIMEOUT=111", "YTDLP_TIMEOUT=222"], + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + + # Verify both were written + config_file = tmp_path / "ArchiveBox.conf" + content = config_file.read_text() + assert "111" in content + assert "222" in content + + +def test_config_set_invalid_key_fails(tmp_path, process): + """Test that setting invalid config key fails.""" + os.chdir(tmp_path) + + result = subprocess.run( + ["archivebox", "config", "--set", "TOTALLY_INVALID_KEY_XYZ=value"], + capture_output=True, + text=True, + ) + + assert result.returncode != 0 + + +def test_config_set_requires_equals_sign(tmp_path, process): + """Test that set requires KEY=VALUE format.""" + os.chdir(tmp_path) + + result = subprocess.run( + ["archivebox", "config", "--set", "TIMEOUT"], + capture_output=True, + text=True, + ) + + assert result.returncode != 0 + + +def test_config_search_finds_keys(tmp_path, process): + """Test that config --search finds matching keys.""" + os.chdir(tmp_path) + + result = subprocess.run( + ["archivebox", "config", "--search", "TIMEOUT"], + capture_output=True, + text=True, + ) + + # Should find timeout-related config + assert "TIMEOUT" in result.stdout + + +def test_config_preserves_existing_values(tmp_path, process): + """Test that setting new values preserves existing ones.""" + os.chdir(tmp_path) + + # Set first value + subprocess.run( + ["archivebox", "config", "--set", "TIMEOUT=100"], + capture_output=True, + ) + + # Set second value + subprocess.run( + ["archivebox", "config", "--set", "YTDLP_TIMEOUT=200"], + capture_output=True, + ) + + # Verify both are in config file + config_file = tmp_path / "ArchiveBox.conf" 
+ content = config_file.read_text() + assert "TIMEOUT" in content + assert "YTDLP_TIMEOUT" in content + + +def test_config_file_is_valid_toml(tmp_path, process): + """Test that config file remains valid TOML after set.""" + os.chdir(tmp_path) + + subprocess.run( + ["archivebox", "config", "--set", "TIMEOUT=150"], + capture_output=True, + ) + + config_file = tmp_path / "ArchiveBox.conf" + content = config_file.read_text() + + # Basic TOML validation - should have sections and key=value pairs + assert "[" in content or "=" in content + + +def test_config_updates_existing_value(tmp_path, process): + """Test that setting same key twice updates the value.""" + os.chdir(tmp_path) + + # Set initial value + subprocess.run( + ["archivebox", "config", "--set", "TIMEOUT=100"], + capture_output=True, + ) + + # Update to new value + subprocess.run( + ["archivebox", "config", "--set", "TIMEOUT=200"], + capture_output=True, + ) + + # Get current value + result = subprocess.run( + ["archivebox", "config", "--get", "TIMEOUT"], + capture_output=True, + text=True, + ) + + # Should show updated value + assert "200" in result.stdout diff --git a/archivebox/tests/test_cli_crawl.py b/archivebox/tests/test_cli_crawl.py new file mode 100644 index 0000000000..62482b108f --- /dev/null +++ b/archivebox/tests/test_cli_crawl.py @@ -0,0 +1,258 @@ +""" +Tests for archivebox crawl CLI command. 
+ +Tests cover: +- crawl create (with URLs, from stdin, pass-through) +- crawl list (with filters) +- crawl update +- crawl delete +""" + +import json + +from archivebox.tests.conftest import ( + run_archivebox_cmd, + parse_jsonl_output, + create_test_url, +) + + +class TestCrawlCreate: + """Tests for `archivebox crawl create`.""" + + def test_create_from_url_args(self, initialized_archive): + """Create crawl from URL arguments.""" + url = create_test_url() + + stdout, stderr, code = run_archivebox_cmd( + ["crawl", "create", url], + data_dir=initialized_archive, + ) + + assert code == 0, f"Command failed: {stderr}" + assert "Created crawl" in stderr + + # Check JSONL output + records = parse_jsonl_output(stdout) + assert len(records) == 1 + assert records[0]["type"] == "Crawl" + assert url in records[0]["urls"] + + def test_create_from_stdin_urls(self, initialized_archive): + """Create crawl from stdin URLs (one per line).""" + urls = [create_test_url() for _ in range(3)] + stdin = "\n".join(urls) + + stdout, stderr, code = run_archivebox_cmd( + ["crawl", "create"], + stdin=stdin, + data_dir=initialized_archive, + ) + + assert code == 0, f"Command failed: {stderr}" + + records = parse_jsonl_output(stdout) + assert len(records) == 1 + crawl = records[0] + assert crawl["type"] == "Crawl" + # All URLs should be in the crawl + for url in urls: + assert url in crawl["urls"] + + def test_create_with_depth(self, initialized_archive): + """Create crawl with --depth flag.""" + url = create_test_url() + + stdout, stderr, code = run_archivebox_cmd( + ["crawl", "create", "--depth=2", url], + data_dir=initialized_archive, + ) + + assert code == 0 + records = parse_jsonl_output(stdout) + assert records[0]["max_depth"] == 2 + + def test_create_with_tag(self, initialized_archive): + """Create crawl with --tag flag.""" + url = create_test_url() + + stdout, stderr, code = run_archivebox_cmd( + ["crawl", "create", "--tag=test-tag", url], + data_dir=initialized_archive, + ) + + assert 
code == 0 + records = parse_jsonl_output(stdout) + assert "test-tag" in records[0].get("tags_str", "") + + def test_create_pass_through_other_types(self, initialized_archive): + """Pass-through records of other types unchanged.""" + tag_record = {"type": "Tag", "id": "fake-tag-id", "name": "test"} + url = create_test_url() + stdin = json.dumps(tag_record) + "\n" + json.dumps({"url": url}) + + stdout, stderr, code = run_archivebox_cmd( + ["crawl", "create"], + stdin=stdin, + data_dir=initialized_archive, + ) + + assert code == 0 + records = parse_jsonl_output(stdout) + + # Should have both the passed-through Tag and the new Crawl + types = [r.get("type") for r in records] + assert "Tag" in types + assert "Crawl" in types + + def test_create_pass_through_existing_crawl(self, initialized_archive): + """Existing Crawl records (with id) are passed through.""" + # First create a crawl + url = create_test_url() + stdout1, _, _ = run_archivebox_cmd(["crawl", "create", url], data_dir=initialized_archive) + crawl = parse_jsonl_output(stdout1)[0] + + # Now pipe it back - should pass through + stdout2, stderr, code = run_archivebox_cmd( + ["crawl", "create"], + stdin=json.dumps(crawl), + data_dir=initialized_archive, + ) + + assert code == 0 + records = parse_jsonl_output(stdout2) + assert len(records) == 1 + assert records[0]["id"] == crawl["id"] + + +class TestCrawlList: + """Tests for `archivebox crawl list`.""" + + def test_list_empty(self, initialized_archive): + """List with no crawls returns empty.""" + stdout, stderr, code = run_archivebox_cmd( + ["crawl", "list"], + data_dir=initialized_archive, + ) + + assert code == 0 + assert "Listed 0 crawls" in stderr + + def test_list_returns_created(self, initialized_archive): + """List returns previously created crawls.""" + url = create_test_url() + run_archivebox_cmd(["crawl", "create", url], data_dir=initialized_archive) + + stdout, stderr, code = run_archivebox_cmd( + ["crawl", "list"], + data_dir=initialized_archive, + ) 
+ + assert code == 0 + records = parse_jsonl_output(stdout) + assert len(records) >= 1 + assert any(url in r.get("urls", "") for r in records) + + def test_list_filter_by_status(self, initialized_archive): + """Filter crawls by status.""" + url = create_test_url() + run_archivebox_cmd(["crawl", "create", url], data_dir=initialized_archive) + + stdout, stderr, code = run_archivebox_cmd( + ["crawl", "list", "--status=queued"], + data_dir=initialized_archive, + ) + + assert code == 0 + records = parse_jsonl_output(stdout) + for r in records: + assert r["status"] == "queued" + + def test_list_with_limit(self, initialized_archive): + """Limit number of results.""" + # Create multiple crawls + for _ in range(3): + run_archivebox_cmd(["crawl", "create", create_test_url()], data_dir=initialized_archive) + + stdout, stderr, code = run_archivebox_cmd( + ["crawl", "list", "--limit=2"], + data_dir=initialized_archive, + ) + + assert code == 0 + records = parse_jsonl_output(stdout) + assert len(records) == 2 + + +class TestCrawlUpdate: + """Tests for `archivebox crawl update`.""" + + def test_update_status(self, initialized_archive): + """Update crawl status.""" + # Create a crawl + url = create_test_url() + stdout1, _, _ = run_archivebox_cmd(["crawl", "create", url], data_dir=initialized_archive) + crawl = parse_jsonl_output(stdout1)[0] + + # Update it + stdout2, stderr, code = run_archivebox_cmd( + ["crawl", "update", "--status=started"], + stdin=json.dumps(crawl), + data_dir=initialized_archive, + ) + + assert code == 0 + assert "Updated 1 crawls" in stderr + + records = parse_jsonl_output(stdout2) + assert records[0]["status"] == "started" + + +class TestCrawlDelete: + """Tests for `archivebox crawl delete`.""" + + def test_delete_requires_yes(self, initialized_archive): + """Delete requires --yes flag.""" + url = create_test_url() + stdout1, _, _ = run_archivebox_cmd(["crawl", "create", url], data_dir=initialized_archive) + crawl = parse_jsonl_output(stdout1)[0] + + 
stdout, stderr, code = run_archivebox_cmd( + ["crawl", "delete"], + stdin=json.dumps(crawl), + data_dir=initialized_archive, + ) + + assert code == 1 + assert "--yes" in stderr + + def test_delete_with_yes(self, initialized_archive): + """Delete with --yes flag works.""" + url = create_test_url() + stdout1, _, _ = run_archivebox_cmd(["crawl", "create", url], data_dir=initialized_archive) + crawl = parse_jsonl_output(stdout1)[0] + + stdout, stderr, code = run_archivebox_cmd( + ["crawl", "delete", "--yes"], + stdin=json.dumps(crawl), + data_dir=initialized_archive, + ) + + assert code == 0 + assert "Deleted 1 crawls" in stderr + + def test_delete_dry_run(self, initialized_archive): + """Dry run shows what would be deleted.""" + url = create_test_url() + stdout1, _, _ = run_archivebox_cmd(["crawl", "create", url], data_dir=initialized_archive) + crawl = parse_jsonl_output(stdout1)[0] + + stdout, stderr, code = run_archivebox_cmd( + ["crawl", "delete", "--dry-run"], + stdin=json.dumps(crawl), + data_dir=initialized_archive, + ) + + assert code == 0 + assert "Would delete" in stderr + assert "dry run" in stderr.lower() diff --git a/archivebox/tests/test_cli_extract.py b/archivebox/tests/test_cli_extract.py new file mode 100644 index 0000000000..0d1e5b0026 --- /dev/null +++ b/archivebox/tests/test_cli_extract.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 +""" +Tests for archivebox extract command. +Verify extract re-runs extractors on existing snapshots. 
+""" + +import os +import sqlite3 +import subprocess + + +def test_extract_runs_on_existing_snapshots(tmp_path, process, disable_extractors_dict): + """Test that extract command runs on existing snapshots.""" + os.chdir(tmp_path) + + # Add a snapshot first + subprocess.run( + ["archivebox", "add", "--index-only", "--depth=0", "https://example.com"], + capture_output=True, + env=disable_extractors_dict, + ) + + # Run extract + result = subprocess.run( + ["archivebox", "extract"], + capture_output=True, + env=disable_extractors_dict, + timeout=30, + ) + + # Should complete + assert result.returncode in [0, 1] + + +def test_extract_preserves_snapshot_count(tmp_path, process, disable_extractors_dict): + """Test that extract doesn't change snapshot count.""" + os.chdir(tmp_path) + + # Add snapshot + subprocess.run( + ["archivebox", "add", "--index-only", "--depth=0", "https://example.com"], + capture_output=True, + env=disable_extractors_dict, + ) + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + count_before = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0] + conn.close() + + # Run extract + subprocess.run( + ["archivebox", "extract", "--overwrite"], + capture_output=True, + env=disable_extractors_dict, + timeout=30, + ) + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + count_after = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0] + conn.close() + + assert count_after == count_before diff --git a/archivebox/tests/test_cli_extract_input.py b/archivebox/tests/test_cli_extract_input.py new file mode 100644 index 0000000000..481d6b858b --- /dev/null +++ b/archivebox/tests/test_cli_extract_input.py @@ -0,0 +1,273 @@ +"""Tests for archivebox extract input handling and pipelines.""" + +import os +import subprocess +import sqlite3 +import json + + +def test_extract_runs_on_snapshot_id(tmp_path, process, disable_extractors_dict): + """Test that extract command accepts a snapshot ID.""" + os.chdir(tmp_path) + + # First 
create a snapshot + subprocess.run( + ["archivebox", "add", "--index-only", "https://example.com"], + capture_output=True, + env=disable_extractors_dict, + ) + + # Get the snapshot ID + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + snapshot_id = c.execute("SELECT id FROM core_snapshot LIMIT 1").fetchone()[0] + conn.close() + + # Run extract on the snapshot + result = subprocess.run( + ["archivebox", "extract", "--no-wait", str(snapshot_id)], + capture_output=True, + text=True, + env=disable_extractors_dict, + ) + + # Should not error about invalid snapshot ID + assert "not found" not in result.stderr.lower() + + +def test_extract_with_enabled_extractor_creates_archiveresult(tmp_path, process, disable_extractors_dict): + """Test that extract creates ArchiveResult when extractor is enabled.""" + os.chdir(tmp_path) + + # First create a snapshot + subprocess.run( + ["archivebox", "add", "--index-only", "https://example.com"], + capture_output=True, + env=disable_extractors_dict, + ) + + # Get the snapshot ID + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + snapshot_id = c.execute("SELECT id FROM core_snapshot LIMIT 1").fetchone()[0] + conn.close() + + # Run extract with title extractor enabled + env = disable_extractors_dict.copy() + env["SAVE_TITLE"] = "true" + + subprocess.run( + ["archivebox", "extract", "--no-wait", str(snapshot_id)], + capture_output=True, + text=True, + env=env, + ) + + # Check for archiveresults (may be queued, not completed with --no-wait) + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + count = c.execute( + "SELECT COUNT(*) FROM core_archiveresult WHERE snapshot_id = ?", + (snapshot_id,), + ).fetchone()[0] + conn.close() + + # May or may not have results depending on timing + assert count >= 0 + + +def test_extract_plugin_option_accepted(tmp_path, process, disable_extractors_dict): + """Test that --plugin option is accepted.""" + os.chdir(tmp_path) + + # First create a snapshot + subprocess.run( + 
["archivebox", "add", "--index-only", "https://example.com"], + capture_output=True, + env=disable_extractors_dict, + ) + + # Get the snapshot ID + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + snapshot_id = c.execute("SELECT id FROM core_snapshot LIMIT 1").fetchone()[0] + conn.close() + + result = subprocess.run( + ["archivebox", "extract", "--plugin=title", "--no-wait", str(snapshot_id)], + capture_output=True, + text=True, + env=disable_extractors_dict, + ) + + assert "unrecognized arguments: --plugin" not in result.stderr + + +def test_extract_stdin_snapshot_id(tmp_path, process, disable_extractors_dict): + """Test that extract reads snapshot IDs from stdin.""" + os.chdir(tmp_path) + + # First create a snapshot + subprocess.run( + ["archivebox", "add", "--index-only", "https://example.com"], + capture_output=True, + env=disable_extractors_dict, + ) + + # Get the snapshot ID + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + snapshot_id = c.execute("SELECT id FROM core_snapshot LIMIT 1").fetchone()[0] + conn.close() + + result = subprocess.run( + ["archivebox", "extract", "--no-wait"], + input=f"{snapshot_id}\n", + capture_output=True, + text=True, + env=disable_extractors_dict, + ) + + # Should not show "not found" error + assert "not found" not in result.stderr.lower() or result.returncode == 0 + + +def test_extract_stdin_jsonl_input(tmp_path, process, disable_extractors_dict): + """Test that extract reads JSONL records from stdin.""" + os.chdir(tmp_path) + + # First create a snapshot + subprocess.run( + ["archivebox", "add", "--index-only", "https://example.com"], + capture_output=True, + env=disable_extractors_dict, + ) + + # Get the snapshot ID + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + snapshot_id = c.execute("SELECT id FROM core_snapshot LIMIT 1").fetchone()[0] + conn.close() + + jsonl_input = json.dumps({"type": "Snapshot", "id": str(snapshot_id)}) + "\n" + + result = subprocess.run( + ["archivebox", 
"extract", "--no-wait"], + input=jsonl_input, + capture_output=True, + text=True, + env=disable_extractors_dict, + ) + + # Should not show "not found" error + assert "not found" not in result.stderr.lower() or result.returncode == 0 + + +def test_extract_pipeline_from_snapshot(tmp_path, process, disable_extractors_dict): + """Test piping snapshot output to extract.""" + os.chdir(tmp_path) + + # Create snapshot and pipe to extract + snapshot_proc = subprocess.Popen( + ["archivebox", "snapshot", "https://example.com"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + env=disable_extractors_dict, + ) + + subprocess.run( + ["archivebox", "extract", "--no-wait"], + stdin=snapshot_proc.stdout, + capture_output=True, + text=True, + env=disable_extractors_dict, + ) + + snapshot_proc.wait() + + # Check database for snapshot + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + snapshot = c.execute( + "SELECT id, url FROM core_snapshot WHERE url = ?", + ("https://example.com",), + ).fetchone() + conn.close() + + assert snapshot is not None, "Snapshot should be created by pipeline" + + +def test_extract_multiple_snapshots(tmp_path, process, disable_extractors_dict): + """Test extracting from multiple snapshots.""" + os.chdir(tmp_path) + + # Create multiple snapshots one at a time to avoid deduplication issues + subprocess.run( + ["archivebox", "add", "--index-only", "https://example.com"], + capture_output=True, + env=disable_extractors_dict, + ) + subprocess.run( + ["archivebox", "add", "--index-only", "https://iana.org"], + capture_output=True, + env=disable_extractors_dict, + ) + + # Get all snapshot IDs + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + snapshot_ids = c.execute("SELECT id FROM core_snapshot").fetchall() + conn.close() + + assert len(snapshot_ids) >= 2, "Should have at least 2 snapshots" + + # Extract from all snapshots + ids_input = "\n".join(str(s[0]) for s in snapshot_ids) + "\n" + result = subprocess.run( + ["archivebox", 
"extract", "--no-wait"], + input=ids_input, + capture_output=True, + text=True, + env=disable_extractors_dict, + ) + assert result.returncode == 0, result.stderr + + # Should not error + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + count = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0] + conn.close() + + assert count >= 2, "Both snapshots should still exist after extraction" + + +class TestExtractCLI: + """Test the CLI interface for extract command.""" + + def test_cli_help(self, tmp_path, process): + """Test that --help works for extract command.""" + os.chdir(tmp_path) + + result = subprocess.run( + ["archivebox", "extract", "--help"], + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + assert "--plugin" in result.stdout or "-p" in result.stdout + assert "--wait" in result.stdout or "--no-wait" in result.stdout + + def test_cli_no_snapshots_shows_warning(self, tmp_path, process): + """Test that running without snapshots shows a warning.""" + os.chdir(tmp_path) + + result = subprocess.run( + ["archivebox", "extract", "--no-wait"], + input="", + capture_output=True, + text=True, + ) + + # Should show warning about no snapshots or exit normally (empty input) + assert result.returncode == 0 or "No" in result.stderr diff --git a/archivebox/tests/test_cli_help.py b/archivebox/tests/test_cli_help.py new file mode 100644 index 0000000000..772e2a08e2 --- /dev/null +++ b/archivebox/tests/test_cli_help.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python3 +""" +Tests for archivebox help command. +Verify command runs successfully and produces output. 
+""" + +import os +import subprocess + + +def test_help_runs_successfully(tmp_path): + """Test that help command runs and produces output.""" + os.chdir(tmp_path) + result = subprocess.run(["archivebox", "help"], capture_output=True, text=True) + + assert result.returncode == 0 + combined = result.stdout + result.stderr + assert len(combined) > 100 + assert "archivebox" in combined.lower() + + +def test_help_in_initialized_dir(tmp_path, process): + """Test help command in initialized data directory.""" + os.chdir(tmp_path) + result = subprocess.run(["archivebox", "help"], capture_output=True, text=True) + + assert result.returncode == 0 + combined = result.stdout + result.stderr + assert "init" in combined + assert "add" in combined diff --git a/archivebox/tests/test_cli_init.py b/archivebox/tests/test_cli_init.py new file mode 100644 index 0000000000..3510a08650 --- /dev/null +++ b/archivebox/tests/test_cli_init.py @@ -0,0 +1,264 @@ +#!/usr/bin/env python3 +""" +Comprehensive tests for archivebox init command. +Verify init creates correct database schema, filesystem structure, and config. 
+""" + +import os +import sqlite3 +import subprocess + +from archivebox.config.common import STORAGE_CONFIG + + +DIR_PERMISSIONS = STORAGE_CONFIG.OUTPUT_PERMISSIONS.replace("6", "7").replace("4", "5") + + +def test_init_creates_database_file(tmp_path): + """Test that init creates index.sqlite3 database file.""" + os.chdir(tmp_path) + result = subprocess.run(["archivebox", "init"], capture_output=True) + + assert result.returncode == 0 + db_path = tmp_path / "index.sqlite3" + assert db_path.exists() + assert db_path.is_file() + + +def test_init_creates_archive_directory(tmp_path): + """Test that init creates archive directory.""" + os.chdir(tmp_path) + subprocess.run(["archivebox", "init"], capture_output=True) + + archive_dir = tmp_path / "archive" + assert archive_dir.exists() + assert archive_dir.is_dir() + + +def test_init_creates_sources_directory(tmp_path): + """Test that init creates sources directory.""" + os.chdir(tmp_path) + subprocess.run(["archivebox", "init"], capture_output=True) + + sources_dir = tmp_path / "sources" + assert sources_dir.exists() + assert sources_dir.is_dir() + + +def test_init_creates_logs_directory(tmp_path): + """Test that init creates logs directory.""" + os.chdir(tmp_path) + subprocess.run(["archivebox", "init"], capture_output=True) + + logs_dir = tmp_path / "logs" + assert logs_dir.exists() + assert logs_dir.is_dir() + + +def test_init_creates_config_file(tmp_path): + """Test that init creates ArchiveBox.conf config file.""" + os.chdir(tmp_path) + subprocess.run(["archivebox", "init"], capture_output=True) + + config_file = tmp_path / "ArchiveBox.conf" + assert config_file.exists() + assert config_file.is_file() + + +def test_init_runs_migrations(tmp_path): + """Test that init runs Django migrations and creates core tables.""" + os.chdir(tmp_path) + subprocess.run(["archivebox", "init"], capture_output=True) + + # Check that migrations were applied + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + + # Check 
django_migrations table exists + migrations = c.execute( + "SELECT name FROM sqlite_master WHERE type='table' AND name='django_migrations'", + ).fetchall() + assert len(migrations) == 1 + + # Check that some migrations were applied + migration_count = c.execute("SELECT COUNT(*) FROM django_migrations").fetchone()[0] + assert migration_count > 0 + + conn.close() + + +def test_init_creates_core_snapshot_table(tmp_path): + """Test that init creates core_snapshot table.""" + os.chdir(tmp_path) + subprocess.run(["archivebox", "init"], capture_output=True) + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + + # Check core_snapshot table exists + tables = c.execute( + "SELECT name FROM sqlite_master WHERE type='table' AND name='core_snapshot'", + ).fetchall() + assert len(tables) == 1 + + conn.close() + + +def test_init_creates_crawls_crawl_table(tmp_path): + """Test that init creates crawls_crawl table.""" + os.chdir(tmp_path) + subprocess.run(["archivebox", "init"], capture_output=True) + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + + # Check crawls_crawl table exists + tables = c.execute( + "SELECT name FROM sqlite_master WHERE type='table' AND name='crawls_crawl'", + ).fetchall() + assert len(tables) == 1 + + conn.close() + + +def test_init_creates_core_archiveresult_table(tmp_path): + """Test that init creates core_archiveresult table.""" + os.chdir(tmp_path) + subprocess.run(["archivebox", "init"], capture_output=True) + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + + # Check core_archiveresult table exists + tables = c.execute( + "SELECT name FROM sqlite_master WHERE type='table' AND name='core_archiveresult'", + ).fetchall() + assert len(tables) == 1 + + conn.close() + + +def test_init_sets_correct_file_permissions(tmp_path): + """Test that init sets correct permissions on created files.""" + os.chdir(tmp_path) + subprocess.run(["archivebox", "init"], capture_output=True) + + # Check database permissions + db_path 
= tmp_path / "index.sqlite3" + assert oct(db_path.stat().st_mode)[-3:] in (STORAGE_CONFIG.OUTPUT_PERMISSIONS, DIR_PERMISSIONS) + + # Check directory permissions + archive_dir = tmp_path / "archive" + assert oct(archive_dir.stat().st_mode)[-3:] in (STORAGE_CONFIG.OUTPUT_PERMISSIONS, DIR_PERMISSIONS) + + +def test_init_is_idempotent(tmp_path): + """Test that running init multiple times is safe (idempotent).""" + os.chdir(tmp_path) + + # First init + result1 = subprocess.run(["archivebox", "init"], capture_output=True, text=True) + assert result1.returncode == 0 + assert "Initializing a new ArchiveBox" in result1.stdout + + # Second init should update, not fail + result2 = subprocess.run(["archivebox", "init"], capture_output=True, text=True) + assert result2.returncode == 0 + assert "updating existing ArchiveBox" in result2.stdout or "up-to-date" in result2.stdout.lower() + + # Database should still be valid + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + count = c.execute("SELECT COUNT(*) FROM django_migrations").fetchone()[0] + assert count > 0 + conn.close() + + +def test_init_with_existing_data_preserves_snapshots(tmp_path, process, disable_extractors_dict): + """Test that re-running init preserves existing snapshot data.""" + os.chdir(tmp_path) + + # Add a snapshot + subprocess.run( + ["archivebox", "add", "--index-only", "--depth=0", "https://example.com"], + capture_output=True, + env=disable_extractors_dict, + ) + + # Check snapshot was created + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + count_before = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0] + assert count_before == 1 + conn.close() + + # Run init again + result = subprocess.run(["archivebox", "init"], capture_output=True) + assert result.returncode == 0 + + # Snapshot should still exist + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + count_after = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0] + assert count_after == 
count_before + conn.close() + + +def test_init_quick_flag_skips_checks(tmp_path): + """Test that init --quick runs faster by skipping some checks.""" + os.chdir(tmp_path) + + result = subprocess.run(["archivebox", "init", "--quick"], capture_output=True, text=True) + + assert result.returncode == 0 + # Database should still be created + db_path = tmp_path / "index.sqlite3" + assert db_path.exists() + + +def test_init_creates_machine_table(tmp_path): + """Test that init creates the machine_machine table.""" + os.chdir(tmp_path) + subprocess.run(["archivebox", "init"], capture_output=True) + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + + # Check machine_machine table exists + tables = c.execute( + "SELECT name FROM sqlite_master WHERE type='table' AND name='machine_machine'", + ).fetchall() + conn.close() + + assert len(tables) == 1 + + +def test_init_output_shows_collection_info(tmp_path): + """Test that init output shows helpful collection information.""" + os.chdir(tmp_path) + result = subprocess.run(["archivebox", "init"], capture_output=True, text=True) + + output = result.stdout + # Should show some helpful info about the collection + assert "ArchiveBox" in output or "collection" in output.lower() or "Initializing" in output + + +def test_init_ignores_unrecognized_archive_directories(tmp_path, process, disable_extractors_dict): + """Test that init upgrades existing dirs without choking on extra folders.""" + os.chdir(tmp_path) + subprocess.run( + ["archivebox", "add", "--index-only", "--depth=0", "https://example.com"], + capture_output=True, + env=disable_extractors_dict, + check=True, + ) + (tmp_path / "archive" / "some_random_folder").mkdir(parents=True, exist_ok=True) + + result = subprocess.run( + ["archivebox", "init"], + capture_output=True, + text=True, + env=disable_extractors_dict, + ) + + assert result.returncode == 0, result.stdout + result.stderr diff --git a/archivebox/tests/test_cli_install.py 
b/archivebox/tests/test_cli_install.py new file mode 100644 index 0000000000..3b0057a2f9 --- /dev/null +++ b/archivebox/tests/test_cli_install.py @@ -0,0 +1,189 @@ +#!/usr/bin/env python3 +""" +Comprehensive tests for archivebox install command. +Verify install detects and records binary dependencies in DB. +""" + +import os +import sqlite3 +import subprocess +from pathlib import Path + + +def test_install_runs_successfully(tmp_path, process): + """Test that install command runs without error.""" + os.chdir(tmp_path) + result = subprocess.run( + ["archivebox", "install", "--dry-run"], + capture_output=True, + text=True, + timeout=60, + ) + + # Dry run should complete quickly + assert result.returncode in [0, 1] # May return 1 if binaries missing + + +def test_install_creates_binary_records_in_db(tmp_path, process): + """Test that install creates Binary records in database.""" + os.chdir(tmp_path) + + subprocess.run( + ["archivebox", "install", "--dry-run"], + capture_output=True, + timeout=60, + ) + + # Check that binary records were created + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + + # Check machine_binary table exists + tables = c.execute( + "SELECT name FROM sqlite_master WHERE type='table' AND name='machine_binary'", + ).fetchall() + conn.close() + + assert len(tables) == 1 + + +def test_install_dry_run_does_not_install(tmp_path, process): + """Test that --dry-run doesn't actually install anything.""" + os.chdir(tmp_path) + + result = subprocess.run( + ["archivebox", "install", "--dry-run"], + capture_output=True, + text=True, + timeout=60, + ) + + # Should complete without actually installing + assert "dry" in result.stdout.lower() or result.returncode in [0, 1] + + +def test_install_detects_system_binaries(tmp_path, process): + """Test that install detects existing system binaries.""" + os.chdir(tmp_path) + + result = subprocess.run( + ["archivebox", "install", "--dry-run"], + capture_output=True, + text=True, + timeout=60, + ) + + # 
Should detect at least some common binaries (python, curl, etc) + assert result.returncode in [0, 1] + + +def test_install_shows_binary_status(tmp_path, process): + """Test that install shows status of binaries.""" + os.chdir(tmp_path) + + result = subprocess.run( + ["archivebox", "install", "--dry-run"], + capture_output=True, + text=True, + timeout=60, + ) + + output = result.stdout + result.stderr + # Should show some binary information + assert len(output) > 50 + + +def test_install_dry_run_prints_dry_run_message(tmp_path, process): + """Test that install --dry-run clearly reports that no changes will be made.""" + os.chdir(tmp_path) + result = subprocess.run( + ["archivebox", "install", "--dry-run"], + capture_output=True, + text=True, + timeout=60, + ) + + assert result.returncode == 0 + assert "dry run" in result.stdout.lower() + + +def test_install_help_lists_dry_run_flag(tmp_path): + """Test that install --help documents the dry-run option.""" + os.chdir(tmp_path) + result = subprocess.run( + ["archivebox", "install", "--help"], + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + assert "--dry-run" in result.stdout or "-d" in result.stdout + + +def test_install_invalid_option_fails(tmp_path): + """Test that invalid install options fail cleanly.""" + os.chdir(tmp_path) + result = subprocess.run( + ["archivebox", "install", "--invalid-option"], + capture_output=True, + text=True, + ) + + assert result.returncode != 0 + + +def test_install_from_empty_dir_initializes_collection(tmp_path): + """Test that install bootstraps an empty dir before performing work.""" + os.chdir(tmp_path) + result = subprocess.run( + ["archivebox", "install", "--dry-run"], + capture_output=True, + text=True, + ) + + output = result.stdout + result.stderr + assert result.returncode == 0 + assert "Initializing" in output or "Dry run" in output or "init" in output.lower() + + +def test_install_updates_binary_table(tmp_path, process): + """Test that install 
completes and only mutates dependency state.""" + os.chdir(tmp_path) + env = os.environ.copy() + tmp_short = Path("/tmp") / f"abx-install-{tmp_path.name}" + tmp_short.mkdir(parents=True, exist_ok=True) + env.update( + { + "TMP_DIR": str(tmp_short), + "ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS": "true", + }, + ) + + result = subprocess.run( + ["archivebox", "install"], + capture_output=True, + text=True, + timeout=420, + env=env, + ) + + output = result.stdout + result.stderr + assert result.returncode == 0, output + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + + binary_counts = dict( + c.execute( + "SELECT status, COUNT(*) FROM machine_binary GROUP BY status", + ).fetchall(), + ) + snapshot_count = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0] + sealed_crawls = c.execute( + "SELECT COUNT(*) FROM crawls_crawl WHERE status='sealed'", + ).fetchone()[0] + conn.close() + + assert sealed_crawls == 0 + assert snapshot_count == 0 + assert binary_counts.get("installed", 0) > 0 diff --git a/archivebox/tests/test_cli_list.py b/archivebox/tests/test_cli_list.py new file mode 100644 index 0000000000..927b2b38a6 --- /dev/null +++ b/archivebox/tests/test_cli_list.py @@ -0,0 +1,191 @@ +#!/usr/bin/env python3 +""" +Tests for archivebox list command. +Verify list emits snapshot JSONL and applies the documented filters. 
+""" + +import json +import os +import sqlite3 +import subprocess + + +def _parse_jsonl(stdout: str) -> list[dict]: + return [json.loads(line) for line in stdout.splitlines() if line.strip().startswith("{")] + + +def test_list_outputs_existing_snapshots_as_jsonl(tmp_path, process, disable_extractors_dict): + """Test that list prints one JSON object per stored snapshot.""" + os.chdir(tmp_path) + for url in ["https://example.com", "https://iana.org"]: + subprocess.run( + ["archivebox", "add", "--index-only", "--depth=0", url], + capture_output=True, + env=disable_extractors_dict, + check=True, + ) + + result = subprocess.run( + ["archivebox", "list"], + capture_output=True, + text=True, + timeout=30, + ) + + rows = _parse_jsonl(result.stdout) + urls = {row["url"] for row in rows} + + assert result.returncode == 0, result.stderr + assert "https://example.com" in urls + assert "https://iana.org" in urls + + +def test_list_filters_by_url_icontains(tmp_path, process, disable_extractors_dict): + """Test that list --url__icontains returns only matching snapshots.""" + os.chdir(tmp_path) + for url in ["https://example.com", "https://iana.org"]: + subprocess.run( + ["archivebox", "add", "--index-only", "--depth=0", url], + capture_output=True, + env=disable_extractors_dict, + check=True, + ) + + result = subprocess.run( + ["archivebox", "list", "--url__icontains", "example.com"], + capture_output=True, + text=True, + timeout=30, + ) + + rows = _parse_jsonl(result.stdout) + assert result.returncode == 0, result.stderr + assert len(rows) == 1 + assert rows[0]["url"] == "https://example.com" + + +def test_list_filters_by_crawl_id_and_limit(tmp_path, process, disable_extractors_dict): + """Test that crawl-id and limit filters constrain the result set.""" + os.chdir(tmp_path) + for url in ["https://example.com", "https://iana.org"]: + subprocess.run( + ["archivebox", "add", "--index-only", "--depth=0", url], + capture_output=True, + env=disable_extractors_dict, + check=True, + ) 
+ + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + crawl_id = str( + c.execute( + "SELECT crawl_id FROM core_snapshot WHERE url = ?", + ("https://example.com",), + ).fetchone()[0], + ) + conn.close() + + result = subprocess.run( + ["archivebox", "list", "--crawl-id", crawl_id, "--limit", "1"], + capture_output=True, + text=True, + timeout=30, + ) + + rows = _parse_jsonl(result.stdout) + assert result.returncode == 0, result.stderr + assert len(rows) == 1 + assert rows[0]["crawl_id"].replace("-", "") == crawl_id.replace("-", "") + assert rows[0]["url"] == "https://example.com" + + +def test_list_filters_by_status(tmp_path, process, disable_extractors_dict): + """Test that list can filter using the current snapshot status.""" + os.chdir(tmp_path) + subprocess.run( + ["archivebox", "add", "--index-only", "--depth=0", "https://example.com"], + capture_output=True, + env=disable_extractors_dict, + check=True, + ) + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + status = c.execute("SELECT status FROM core_snapshot LIMIT 1").fetchone()[0] + conn.close() + + result = subprocess.run( + ["archivebox", "list", "--status", status], + capture_output=True, + text=True, + timeout=30, + ) + + rows = _parse_jsonl(result.stdout) + assert result.returncode == 0, result.stderr + assert len(rows) == 1 + assert rows[0]["status"] == status + + +def test_list_help_lists_filter_options(tmp_path, process): + """Test that list --help documents the supported filter flags.""" + os.chdir(tmp_path) + + result = subprocess.run( + ["archivebox", "list", "--help"], + capture_output=True, + text=True, + timeout=30, + ) + + assert result.returncode == 0 + assert "--url__icontains" in result.stdout + assert "--crawl-id" in result.stdout + assert "--limit" in result.stdout + assert "--search" in result.stdout + + +def test_list_allows_sort_with_limit(tmp_path, process, disable_extractors_dict): + """Test that list can sort and then apply limit without queryset slicing 
errors.""" + os.chdir(tmp_path) + for url in ["https://example.com", "https://iana.org", "https://example.net"]: + subprocess.run( + ["archivebox", "add", "--index-only", "--depth=0", url], + capture_output=True, + env=disable_extractors_dict, + check=True, + ) + + result = subprocess.run( + ["archivebox", "list", "--limit", "2", "--sort", "-created_at"], + capture_output=True, + text=True, + timeout=30, + ) + + rows = _parse_jsonl(result.stdout) + assert result.returncode == 0, result.stderr + assert len(rows) == 2 + + +def test_list_search_meta_matches_metadata(tmp_path, process, disable_extractors_dict): + """Test that list --search=meta applies metadata search to the queryset.""" + os.chdir(tmp_path) + subprocess.run( + ["archivebox", "add", "--index-only", "--depth=0", "https://example.com"], + capture_output=True, + env=disable_extractors_dict, + check=True, + ) + + result = subprocess.run( + ["archivebox", "list", "--search=meta", "example.com"], + capture_output=True, + text=True, + timeout=30, + ) + + rows = _parse_jsonl(result.stdout) + assert result.returncode == 0, result.stderr + assert len(rows) == 1 + assert rows[0]["url"] == "https://example.com" diff --git a/archivebox/tests/test_cli_manage.py b/archivebox/tests/test_cli_manage.py new file mode 100644 index 0000000000..9634b63292 --- /dev/null +++ b/archivebox/tests/test_cli_manage.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python3 +""" +Tests for archivebox manage command. +Verify manage command runs Django management commands. 
+""" + +import os +import subprocess + + +def test_manage_help_works(tmp_path, process): + """Test that manage help command works.""" + os.chdir(tmp_path) + + result = subprocess.run( + ["archivebox", "manage", "help"], + capture_output=True, + text=True, + timeout=30, + ) + + assert result.returncode == 0 + assert len(result.stdout) > 100 + + +def test_manage_showmigrations_works(tmp_path, process): + """Test that manage showmigrations works.""" + os.chdir(tmp_path) + + result = subprocess.run( + ["archivebox", "manage", "showmigrations"], + capture_output=True, + text=True, + timeout=30, + ) + + assert result.returncode == 0 + # Should show migration status + assert "core" in result.stdout or "[" in result.stdout + + +def test_manage_dbshell_command_exists(tmp_path, process): + """Test that manage dbshell command is recognized.""" + os.chdir(tmp_path) + + result = subprocess.run( + ["archivebox", "manage", "help", "dbshell"], + capture_output=True, + text=True, + timeout=30, + ) + + # Should show help for dbshell + assert result.returncode == 0 + assert "dbshell" in result.stdout or "database" in result.stdout.lower() + + +def test_manage_check_works(tmp_path, process): + """Test that manage check works.""" + os.chdir(tmp_path) + + result = subprocess.run( + ["archivebox", "manage", "check"], + capture_output=True, + text=True, + timeout=30, + ) + + # Check should complete + assert result.returncode in [0, 1] diff --git a/archivebox/tests/test_cli_piping.py b/archivebox/tests/test_cli_piping.py new file mode 100644 index 0000000000..721f82b387 --- /dev/null +++ b/archivebox/tests/test_cli_piping.py @@ -0,0 +1,408 @@ +""" +Tests for JSONL piping contracts and `archivebox run`. 
+ +This file covers both: +- low-level JSONL/stdin parsing behavior that makes CLI piping work +- subprocess integration for the supported records `archivebox run` consumes +""" + +import sqlite3 +import sys +import uuid +from io import StringIO +from pathlib import Path + +from archivebox.tests.conftest import ( + create_test_url, + parse_jsonl_output, + run_archivebox_cmd, +) + + +PIPE_TEST_ENV = { + "PLUGINS": "favicon", + "SAVE_FAVICON": "True", + "USE_COLOR": "False", + "SHOW_PROGRESS": "False", +} + + +class MockTTYStringIO(StringIO): + def __init__(self, initial_value: str = "", *, is_tty: bool): + super().__init__(initial_value) + self._is_tty = is_tty + + def isatty(self) -> bool: + return self._is_tty + + +def _stdout_lines(stdout: str) -> list[str]: + return [line for line in stdout.splitlines() if line.strip()] + + +def _assert_stdout_is_jsonl_only(stdout: str) -> None: + lines = _stdout_lines(stdout) + assert lines, "Expected stdout to contain JSONL records" + assert all(line.lstrip().startswith("{") for line in lines), stdout + + +def _sqlite_param(value: object) -> object: + if not isinstance(value, str): + return value + try: + return uuid.UUID(value).hex + except ValueError: + return value + + +def _db_value(data_dir: Path, sql: str, params: tuple[object, ...] 
= ()) -> object | None: + conn = sqlite3.connect(data_dir / "index.sqlite3") + try: + row = conn.execute(sql, tuple(_sqlite_param(param) for param in params)).fetchone() + finally: + conn.close() + return row[0] if row else None + + +def test_parse_line_accepts_supported_piping_inputs(): + """The JSONL parser should normalize the input forms CLI pipes accept.""" + from archivebox.misc.jsonl import TYPE_CRAWL, TYPE_SNAPSHOT, parse_line + + assert parse_line("") is None + assert parse_line(" ") is None + assert parse_line("# comment") is None + assert parse_line("not-a-url") is None + assert parse_line("ftp://example.com") is None + + plain_url = parse_line("https://example.com") + assert plain_url == {"type": TYPE_SNAPSHOT, "url": "https://example.com"} + + file_url = parse_line("file:///tmp/example.txt") + assert file_url == {"type": TYPE_SNAPSHOT, "url": "file:///tmp/example.txt"} + + snapshot_json = parse_line('{"type":"Snapshot","url":"https://example.com","tags":"tag1,tag2"}') + assert snapshot_json is not None + assert snapshot_json["type"] == TYPE_SNAPSHOT + assert snapshot_json["tags"] == "tag1,tag2" + + crawl_json = parse_line('{"type":"Crawl","id":"abc123","urls":"https://example.com","max_depth":1}') + assert crawl_json is not None + assert crawl_json["type"] == TYPE_CRAWL + assert crawl_json["id"] == "abc123" + assert crawl_json["max_depth"] == 1 + + snapshot_id = "01234567-89ab-cdef-0123-456789abcdef" + parsed_id = parse_line(snapshot_id) + assert parsed_id == {"type": TYPE_SNAPSHOT, "id": snapshot_id} + + compact_snapshot_id = "0123456789abcdef0123456789abcdef" + compact_parsed_id = parse_line(compact_snapshot_id) + assert compact_parsed_id == {"type": TYPE_SNAPSHOT, "id": compact_snapshot_id} + + +def test_read_args_or_stdin_handles_args_stdin_and_mixed_jsonl(): + """Piping helpers should consume args, structured JSONL, and pass-through records.""" + from archivebox.misc.jsonl import TYPE_CRAWL, read_args_or_stdin + + records = 
list(read_args_or_stdin(("https://example1.com", "https://example2.com"))) + assert [record["url"] for record in records] == ["https://example1.com", "https://example2.com"] + + stdin_records = list( + read_args_or_stdin( + (), + stream=MockTTYStringIO( + "https://plain-url.com\n" + '{"type":"Snapshot","url":"https://jsonl-url.com","tags":"test"}\n' + '{"type":"Tag","id":"tag-1","name":"example"}\n' + "01234567-89ab-cdef-0123-456789abcdef\n" + "not valid json\n", + is_tty=False, + ), + ), + ) + assert len(stdin_records) == 4 + assert stdin_records[0]["url"] == "https://plain-url.com" + assert stdin_records[1]["url"] == "https://jsonl-url.com" + assert stdin_records[1]["tags"] == "test" + assert stdin_records[2]["type"] == "Tag" + assert stdin_records[2]["name"] == "example" + assert stdin_records[3]["id"] == "01234567-89ab-cdef-0123-456789abcdef" + + crawl_records = list( + read_args_or_stdin( + (), + stream=MockTTYStringIO( + '{"type":"Crawl","id":"crawl-1","urls":"https://example.com\\nhttps://foo.com"}\n', + is_tty=False, + ), + ), + ) + assert len(crawl_records) == 1 + assert crawl_records[0]["type"] == TYPE_CRAWL + assert crawl_records[0]["id"] == "crawl-1" + + tty_records = list(read_args_or_stdin((), stream=MockTTYStringIO("https://example.com", is_tty=True))) + assert tty_records == [] + + +def test_collect_urls_from_plugins_reads_only_parser_outputs(tmp_path): + """Parser extractor `urls.jsonl` outputs should be discoverable for recursive piping.""" + from archivebox.hooks import collect_urls_from_plugins + + (tmp_path / "wget").mkdir() + (tmp_path / "wget" / "urls.jsonl").write_text( + '{"url":"https://wget-link-1.com"}\n{"url":"https://wget-link-2.com"}\n', + encoding="utf-8", + ) + (tmp_path / "parse_html_urls").mkdir() + (tmp_path / "parse_html_urls" / "urls.jsonl").write_text( + '{"url":"https://html-link-1.com"}\n{"url":"https://html-link-2.com","title":"HTML Link 2"}\n', + encoding="utf-8", + ) + (tmp_path / "screenshot").mkdir() + + urls = 
collect_urls_from_plugins(tmp_path) + assert len(urls) == 4 + assert {url["plugin"] for url in urls} == {"wget", "parse_html_urls"} + titled = [url for url in urls if url.get("title") == "HTML Link 2"] + assert len(titled) == 1 + assert titled[0]["url"] == "https://html-link-2.com" + + assert collect_urls_from_plugins(tmp_path / "nonexistent") == [] + + +def test_collect_urls_from_plugins_trims_markdown_suffixes(tmp_path): + from archivebox.hooks import collect_urls_from_plugins + + (tmp_path / "parse_html_urls").mkdir() + (tmp_path / "parse_html_urls" / "urls.jsonl").write_text( + '{"url":"https://docs.sweeting.me/s/youtube-favorites)**"}\n', + encoding="utf-8", + ) + + urls = collect_urls_from_plugins(tmp_path) + assert len(urls) == 1 + assert urls[0]["url"] == "https://docs.sweeting.me/s/youtube-favorites" + + +def test_collect_urls_from_plugins_trims_trailing_punctuation(tmp_path): + from archivebox.hooks import collect_urls_from_plugins + + (tmp_path / "parse_html_urls").mkdir() + (tmp_path / "parse_html_urls" / "urls.jsonl").write_text( + ('{"url":"https://github.com/ArchiveBox/ArchiveBox."}\n{"url":"https://github.com/abc?abc#234234?."}\n'), + encoding="utf-8", + ) + + urls = collect_urls_from_plugins(tmp_path) + assert [url["url"] for url in urls] == [ + "https://github.com/ArchiveBox/ArchiveBox", + "https://github.com/abc?abc#234234", + ] + + +def test_crawl_create_stdout_pipes_into_run(initialized_archive): + """`archivebox crawl create | archivebox run` should queue and materialize snapshots.""" + url = create_test_url() + + create_stdout, create_stderr, create_code = run_archivebox_cmd( + ["crawl", "create", url], + data_dir=initialized_archive, + ) + assert create_code == 0, create_stderr + _assert_stdout_is_jsonl_only(create_stdout) + + crawl = next(record for record in parse_jsonl_output(create_stdout) if record.get("type") == "Crawl") + + run_stdout, run_stderr, run_code = run_archivebox_cmd( + ["run"], + stdin=create_stdout, + 
data_dir=initialized_archive, + timeout=120, + env=PIPE_TEST_ENV, + ) + assert run_code == 0, run_stderr + _assert_stdout_is_jsonl_only(run_stdout) + + run_records = parse_jsonl_output(run_stdout) + assert any(record.get("type") == "Crawl" and record.get("id") == crawl["id"] for record in run_records) + + snapshot_count = _db_value( + initialized_archive, + "SELECT COUNT(*) FROM core_snapshot WHERE crawl_id = ?", + (crawl["id"],), + ) + assert isinstance(snapshot_count, int) + assert snapshot_count >= 1 + + +def test_snapshot_list_stdout_pipes_into_run(initialized_archive): + """`archivebox snapshot list | archivebox run` should requeue listed snapshots.""" + url = create_test_url() + + create_stdout, create_stderr, create_code = run_archivebox_cmd( + ["snapshot", "create", url], + data_dir=initialized_archive, + ) + assert create_code == 0, create_stderr + snapshot = next(record for record in parse_jsonl_output(create_stdout) if record.get("type") == "Snapshot") + + list_stdout, list_stderr, list_code = run_archivebox_cmd( + ["snapshot", "list", "--status=queued", f"--url__icontains={snapshot['id']}"], + data_dir=initialized_archive, + ) + if list_code != 0 or not parse_jsonl_output(list_stdout): + list_stdout, list_stderr, list_code = run_archivebox_cmd( + ["snapshot", "list", f"--url__icontains={url}"], + data_dir=initialized_archive, + ) + assert list_code == 0, list_stderr + _assert_stdout_is_jsonl_only(list_stdout) + + run_stdout, run_stderr, run_code = run_archivebox_cmd( + ["run"], + stdin=list_stdout, + data_dir=initialized_archive, + timeout=120, + env=PIPE_TEST_ENV, + ) + assert run_code == 0, run_stderr + _assert_stdout_is_jsonl_only(run_stdout) + + run_records = parse_jsonl_output(run_stdout) + assert any(record.get("type") == "Snapshot" and record.get("id") == snapshot["id"] for record in run_records) + + snapshot_status = _db_value( + initialized_archive, + "SELECT status FROM core_snapshot WHERE id = ?", + (snapshot["id"],), + ) + assert 
snapshot_status == "sealed" + + +def test_archiveresult_list_stdout_pipes_into_run(initialized_archive): + """`archivebox archiveresult list | archivebox run` should preserve clean JSONL stdout.""" + url = create_test_url() + + snapshot_stdout, snapshot_stderr, snapshot_code = run_archivebox_cmd( + ["snapshot", "create", url], + data_dir=initialized_archive, + ) + assert snapshot_code == 0, snapshot_stderr + + ar_create_stdout, ar_create_stderr, ar_create_code = run_archivebox_cmd( + ["archiveresult", "create", "--plugin=favicon"], + stdin=snapshot_stdout, + data_dir=initialized_archive, + ) + assert ar_create_code == 0, ar_create_stderr + + run_archivebox_cmd( + ["run"], + stdin=ar_create_stdout, + data_dir=initialized_archive, + timeout=120, + env=PIPE_TEST_ENV, + ) + + list_stdout, list_stderr, list_code = run_archivebox_cmd( + ["archiveresult", "list", "--plugin=favicon"], + data_dir=initialized_archive, + ) + assert list_code == 0, list_stderr + _assert_stdout_is_jsonl_only(list_stdout) + listed_records = parse_jsonl_output(list_stdout) + archiveresult = next(record for record in listed_records if record.get("type") == "ArchiveResult") + + run_stdout, run_stderr, run_code = run_archivebox_cmd( + ["run"], + stdin=list_stdout, + data_dir=initialized_archive, + timeout=120, + env=PIPE_TEST_ENV, + ) + assert run_code == 0, run_stderr + _assert_stdout_is_jsonl_only(run_stdout) + + run_records = parse_jsonl_output(run_stdout) + assert any(record.get("type") == "ArchiveResult" and record.get("id") == archiveresult["id"] for record in run_records) + + +def test_binary_create_stdout_pipes_into_run(initialized_archive): + """`archivebox binary create | archivebox run` should queue the binary record for processing.""" + create_stdout, create_stderr, create_code = run_archivebox_cmd( + ["binary", "create", "--name=python3", f"--abspath={sys.executable}", "--version=test"], + data_dir=initialized_archive, + ) + assert create_code == 0, create_stderr + 
_assert_stdout_is_jsonl_only(create_stdout) + + binary = next(record for record in parse_jsonl_output(create_stdout) if record.get("type") in {"BinaryRequest", "Binary"}) + + run_stdout, run_stderr, run_code = run_archivebox_cmd( + ["run"], + stdin=create_stdout, + data_dir=initialized_archive, + timeout=120, + ) + assert run_code == 0, run_stderr + _assert_stdout_is_jsonl_only(run_stdout) + + run_records = parse_jsonl_output(run_stdout) + assert any(record.get("type") in {"BinaryRequest", "Binary"} and record.get("id") == binary["id"] for record in run_records) + + status = _db_value( + initialized_archive, + "SELECT status FROM machine_binary WHERE id = ?", + (binary["id"],), + ) + assert status in {"queued", "installed"} + + +def test_multi_stage_pipeline_into_run(initialized_archive): + """`crawl create | snapshot create | archiveresult create | run` should preserve JSONL and finish work.""" + url = create_test_url() + + crawl_stdout, crawl_stderr, crawl_code = run_archivebox_cmd( + ["crawl", "create", url], + data_dir=initialized_archive, + ) + assert crawl_code == 0, crawl_stderr + _assert_stdout_is_jsonl_only(crawl_stdout) + + snapshot_stdout, snapshot_stderr, snapshot_code = run_archivebox_cmd( + ["snapshot", "create"], + stdin=crawl_stdout, + data_dir=initialized_archive, + ) + assert snapshot_code == 0, snapshot_stderr + _assert_stdout_is_jsonl_only(snapshot_stdout) + + archiveresult_stdout, archiveresult_stderr, archiveresult_code = run_archivebox_cmd( + ["archiveresult", "create", "--plugin=favicon"], + stdin=snapshot_stdout, + data_dir=initialized_archive, + ) + assert archiveresult_code == 0, archiveresult_stderr + _assert_stdout_is_jsonl_only(archiveresult_stdout) + + run_stdout, run_stderr, run_code = run_archivebox_cmd( + ["run"], + stdin=archiveresult_stdout, + data_dir=initialized_archive, + timeout=120, + env=PIPE_TEST_ENV, + ) + assert run_code == 0, run_stderr + _assert_stdout_is_jsonl_only(run_stdout) + + run_records = 
parse_jsonl_output(run_stdout) + snapshot = next(record for record in run_records if record.get("type") == "Snapshot") + assert any(record.get("type") == "ArchiveResult" for record in run_records) + + snapshot_status = _db_value( + initialized_archive, + "SELECT status FROM core_snapshot WHERE id = ?", + (snapshot["id"],), + ) + assert snapshot_status == "sealed" diff --git a/archivebox/tests/test_cli_remove.py b/archivebox/tests/test_cli_remove.py new file mode 100644 index 0000000000..fd3da0de37 --- /dev/null +++ b/archivebox/tests/test_cli_remove.py @@ -0,0 +1,257 @@ +#!/usr/bin/env python3 +""" +Comprehensive tests for archivebox remove command. +Verify remove deletes snapshots from DB and filesystem. +""" + +import os +import sqlite3 +import subprocess +from pathlib import Path + + +def _find_snapshot_dir(data_dir: Path, snapshot_id: str) -> Path | None: + candidates = {snapshot_id} + if len(snapshot_id) == 32: + candidates.add(f"{snapshot_id[:8]}-{snapshot_id[8:12]}-{snapshot_id[12:16]}-{snapshot_id[16:20]}-{snapshot_id[20:]}") + elif len(snapshot_id) == 36 and "-" in snapshot_id: + candidates.add(snapshot_id.replace("-", "")) + + for needle in candidates: + for path in data_dir.rglob(needle): + if path.is_dir(): + return path + return None + + +def test_remove_deletes_snapshot_from_db(tmp_path, process, disable_extractors_dict): + """Test that remove command deletes snapshot from database.""" + os.chdir(tmp_path) + + # Add a snapshot + subprocess.run( + ["archivebox", "add", "--index-only", "--depth=0", "https://example.com"], + capture_output=True, + env=disable_extractors_dict, + ) + + # Verify it exists + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + count_before = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0] + conn.close() + assert count_before == 1 + + # Remove it + subprocess.run( + ["archivebox", "remove", "https://example.com", "--yes"], + capture_output=True, + env=disable_extractors_dict, + ) + + # Verify it's gone 
+ conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + count_after = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0] + conn.close() + + assert count_after == 0 + + +def test_remove_deletes_archive_directory(tmp_path, process, disable_extractors_dict): + """Test that remove --delete removes the current snapshot output directory.""" + os.chdir(tmp_path) + + # Add a snapshot + subprocess.run( + ["archivebox", "add", "--index-only", "--depth=0", "https://example.com"], + capture_output=True, + env=disable_extractors_dict, + ) + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + snapshot_id = str(c.execute("SELECT id FROM core_snapshot").fetchone()[0]) + conn.close() + + snapshot_dir = _find_snapshot_dir(tmp_path, snapshot_id) + assert snapshot_dir is not None, f"Snapshot output directory not found for {snapshot_id}" + + subprocess.run( + ["archivebox", "remove", "https://example.com", "--yes", "--delete"], + capture_output=True, + env=disable_extractors_dict, + ) + + assert not snapshot_dir.exists() + + +def test_remove_yes_flag_skips_confirmation(tmp_path, process, disable_extractors_dict): + """Test that --yes flag skips confirmation prompt.""" + os.chdir(tmp_path) + + subprocess.run( + ["archivebox", "add", "--index-only", "--depth=0", "https://example.com"], + capture_output=True, + env=disable_extractors_dict, + ) + + # Remove with --yes should complete without interaction + result = subprocess.run( + ["archivebox", "remove", "https://example.com", "--yes"], + capture_output=True, + env=disable_extractors_dict, + timeout=30, + ) + + assert result.returncode == 0 + output = result.stdout.decode("utf-8") + result.stderr.decode("utf-8") + assert "Index now contains 0 links." 
in output + + +def test_remove_multiple_snapshots(tmp_path, process, disable_extractors_dict): + """Test removing multiple snapshots at once.""" + os.chdir(tmp_path) + + # Add multiple snapshots + for url in ["https://example.com", "https://example.org"]: + subprocess.run( + ["archivebox", "add", "--index-only", "--depth=0", url], + capture_output=True, + env=disable_extractors_dict, + ) + + # Verify both exist + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + count_before = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0] + conn.close() + assert count_before == 2 + + # Remove both + subprocess.run( + ["archivebox", "remove", "https://example.com", "https://example.org", "--yes"], + capture_output=True, + env=disable_extractors_dict, + ) + + # Verify both are gone + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + count_after = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0] + conn.close() + + assert count_after == 0 + + +def test_remove_with_filter(tmp_path, process, disable_extractors_dict): + """Test removing snapshots using filter.""" + os.chdir(tmp_path) + + # Add snapshots + subprocess.run( + ["archivebox", "add", "--index-only", "--depth=0", "https://example.com"], + capture_output=True, + env=disable_extractors_dict, + ) + + # Remove using filter + result = subprocess.run( + ["archivebox", "remove", "--filter-type=search", "--filter=example.com", "--yes"], + capture_output=True, + env=disable_extractors_dict, + timeout=30, + ) + + # Should complete (exit code depends on implementation) + assert result.returncode in [0, 1, 2] + + +def test_remove_with_regex_filter_deletes_all_matches(tmp_path, process, disable_extractors_dict): + """Test regex filters remove every matching snapshot.""" + os.chdir(tmp_path) + + for url in ["https://example.com", "https://iana.org"]: + subprocess.run( + ["archivebox", "add", "--index-only", "--depth=0", url], + capture_output=True, + env=disable_extractors_dict, + 
check=True, + ) + + result = subprocess.run( + ["archivebox", "remove", "--filter-type=regex", ".*", "--yes"], + capture_output=True, + env=disable_extractors_dict, + check=True, + ) + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + count_after = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0] + conn.close() + + output = result.stdout.decode("utf-8") + result.stderr.decode("utf-8") + assert count_after == 0 + assert "Removed" in output or "Found" in output + + +def test_remove_nonexistent_url_fails_gracefully(tmp_path, process, disable_extractors_dict): + """Test that removing non-existent URL fails gracefully.""" + os.chdir(tmp_path) + + result = subprocess.run( + ["archivebox", "remove", "https://nonexistent-url-12345.com", "--yes"], + capture_output=True, + env=disable_extractors_dict, + ) + + # Should fail or show error + stdout_text = result.stdout.decode("utf-8", errors="replace").lower() + assert result.returncode != 0 or "not found" in stdout_text or "no matches" in stdout_text + + +def test_remove_reports_remaining_link_count_correctly(tmp_path, process, disable_extractors_dict): + """Test remove reports the remaining snapshot count after deletion.""" + os.chdir(tmp_path) + + for url in ["https://example.com", "https://example.org"]: + subprocess.run( + ["archivebox", "add", "--index-only", "--depth=0", url], + capture_output=True, + env=disable_extractors_dict, + check=True, + ) + + result = subprocess.run( + ["archivebox", "remove", "https://example.org", "--yes"], + capture_output=True, + env=disable_extractors_dict, + check=True, + ) + + output = result.stdout.decode("utf-8") + result.stderr.decode("utf-8") + assert "Removed 1 out of 2 links" in output + assert "Index now contains 1 links." 
in output + + +def test_remove_after_flag(tmp_path, process, disable_extractors_dict): + """Test remove --after flag removes snapshots after date.""" + os.chdir(tmp_path) + + subprocess.run( + ["archivebox", "add", "--index-only", "--depth=0", "https://example.com"], + capture_output=True, + env=disable_extractors_dict, + ) + + # Try remove with --after flag (should work or show usage) + result = subprocess.run( + ["archivebox", "remove", "--after=2020-01-01", "--yes"], + capture_output=True, + env=disable_extractors_dict, + timeout=30, + ) + + # Should complete + assert result.returncode in [0, 1, 2] diff --git a/archivebox/tests/test_cli_run.py b/archivebox/tests/test_cli_run.py new file mode 100644 index 0000000000..8fa0c887f7 --- /dev/null +++ b/archivebox/tests/test_cli_run.py @@ -0,0 +1,449 @@ +""" +Tests for archivebox run CLI command. + +Tests cover: +- run with stdin JSONL (Crawl, Snapshot, ArchiveResult) +- create-or-update behavior (records with/without id) +- pass-through output (for chaining) +""" + +import json +import sys + +import pytest + +from archivebox.tests.conftest import ( + run_archivebox_cmd, + parse_jsonl_output, + create_test_url, + create_test_crawl_json, + create_test_snapshot_json, +) + +RUN_TEST_ENV = { + "PLUGINS": "favicon", + "SAVE_FAVICON": "True", +} + + +class TestRunWithCrawl: + """Tests for `archivebox run` with Crawl input.""" + + def test_run_with_new_crawl(self, initialized_archive): + """Run creates and processes a new Crawl (no id).""" + crawl_record = create_test_crawl_json() + + stdout, stderr, code = run_archivebox_cmd( + ["run"], + stdin=json.dumps(crawl_record), + data_dir=initialized_archive, + timeout=120, + env=RUN_TEST_ENV, + ) + + assert code == 0, f"Command failed: {stderr}" + + # Should output the created Crawl + records = parse_jsonl_output(stdout) + crawl_records = [r for r in records if r.get("type") == "Crawl"] + assert len(crawl_records) >= 1 + assert crawl_records[0].get("id") # Should have an id now + + 
def test_run_with_existing_crawl(self, initialized_archive): + """Run re-queues an existing Crawl (with id).""" + url = create_test_url() + + # First create a crawl + stdout1, _, _ = run_archivebox_cmd(["crawl", "create", url], data_dir=initialized_archive, env=RUN_TEST_ENV) + crawl = parse_jsonl_output(stdout1)[0] + + # Run with the existing crawl + stdout2, stderr, code = run_archivebox_cmd( + ["run"], + stdin=json.dumps(crawl), + data_dir=initialized_archive, + timeout=120, + env=RUN_TEST_ENV, + ) + + assert code == 0 + records = parse_jsonl_output(stdout2) + assert len(records) >= 1 + + +class TestRunWithSnapshot: + """Tests for `archivebox run` with Snapshot input.""" + + def test_run_with_new_snapshot(self, initialized_archive): + """Run creates and processes a new Snapshot (no id, just url).""" + snapshot_record = create_test_snapshot_json() + + stdout, stderr, code = run_archivebox_cmd( + ["run"], + stdin=json.dumps(snapshot_record), + data_dir=initialized_archive, + timeout=120, + env=RUN_TEST_ENV, + ) + + assert code == 0, f"Command failed: {stderr}" + + records = parse_jsonl_output(stdout) + snapshot_records = [r for r in records if r.get("type") == "Snapshot"] + assert len(snapshot_records) >= 1 + assert snapshot_records[0].get("id") + + def test_run_with_existing_snapshot(self, initialized_archive): + """Run re-queues an existing Snapshot (with id).""" + url = create_test_url() + + # First create a snapshot + stdout1, _, _ = run_archivebox_cmd(["snapshot", "create", url], data_dir=initialized_archive, env=RUN_TEST_ENV) + snapshot = parse_jsonl_output(stdout1)[0] + + # Run with the existing snapshot + stdout2, stderr, code = run_archivebox_cmd( + ["run"], + stdin=json.dumps(snapshot), + data_dir=initialized_archive, + timeout=120, + env=RUN_TEST_ENV, + ) + + assert code == 0 + records = parse_jsonl_output(stdout2) + assert len(records) >= 1 + + def test_run_with_plain_url(self, initialized_archive): + """Run accepts plain URL records (no type field).""" 
+ url = create_test_url() + url_record = {"url": url} + + stdout, stderr, code = run_archivebox_cmd( + ["run"], + stdin=json.dumps(url_record), + data_dir=initialized_archive, + timeout=120, + env=RUN_TEST_ENV, + ) + + assert code == 0 + records = parse_jsonl_output(stdout) + assert len(records) >= 1 + + +class TestRunWithArchiveResult: + """Tests for `archivebox run` with ArchiveResult input.""" + + def test_run_requeues_failed_archiveresult(self, initialized_archive): + """Run re-queues a failed ArchiveResult.""" + url = create_test_url() + + # Create snapshot and archive result + stdout1, _, _ = run_archivebox_cmd(["snapshot", "create", url], data_dir=initialized_archive, env=RUN_TEST_ENV) + snapshot = parse_jsonl_output(stdout1)[0] + + stdout2, _, _ = run_archivebox_cmd( + ["archiveresult", "create", "--plugin=favicon"], + stdin=json.dumps(snapshot), + data_dir=initialized_archive, + env=RUN_TEST_ENV, + ) + ar = next(r for r in parse_jsonl_output(stdout2) if r.get("type") == "ArchiveResult") + + # Update to failed + ar["status"] = "failed" + run_archivebox_cmd( + ["archiveresult", "update", "--status=failed"], + stdin=json.dumps(ar), + data_dir=initialized_archive, + env=RUN_TEST_ENV, + ) + + # Now run should re-queue it + stdout3, stderr, code = run_archivebox_cmd( + ["run"], + stdin=json.dumps(ar), + data_dir=initialized_archive, + timeout=120, + env=RUN_TEST_ENV, + ) + + assert code == 0 + records = parse_jsonl_output(stdout3) + ar_records = [r for r in records if r.get("type") == "ArchiveResult"] + assert len(ar_records) >= 1 + + +class TestRunPassThrough: + """Tests for pass-through behavior in `archivebox run`.""" + + def test_run_passes_through_unknown_types(self, initialized_archive): + """Run passes through records with unknown types.""" + unknown_record = {"type": "Unknown", "id": "fake-id", "data": "test"} + + stdout, stderr, code = run_archivebox_cmd( + ["run"], + stdin=json.dumps(unknown_record), + data_dir=initialized_archive, + ) + + assert code 
== 0 + records = parse_jsonl_output(stdout) + unknown_records = [r for r in records if r.get("type") == "Unknown"] + assert len(unknown_records) == 1 + assert unknown_records[0]["data"] == "test" + + def test_run_outputs_all_processed_records(self, initialized_archive): + """Run outputs all processed records for chaining.""" + url = create_test_url() + crawl_record = create_test_crawl_json(urls=[url]) + + stdout, stderr, code = run_archivebox_cmd( + ["run"], + stdin=json.dumps(crawl_record), + data_dir=initialized_archive, + timeout=120, + env=RUN_TEST_ENV, + ) + + assert code == 0 + records = parse_jsonl_output(stdout) + # Should have at least the Crawl in output + assert len(records) >= 1 + + +class TestRunMixedInput: + """Tests for `archivebox run` with mixed record types.""" + + def test_run_handles_mixed_types(self, initialized_archive): + """Run handles mixed Crawl/Snapshot/ArchiveResult input.""" + crawl = create_test_crawl_json() + snapshot = create_test_snapshot_json() + unknown = {"type": "Tag", "id": "fake", "name": "test"} + + stdin = "\n".join( + [ + json.dumps(crawl), + json.dumps(snapshot), + json.dumps(unknown), + ], + ) + + stdout, stderr, code = run_archivebox_cmd( + ["run"], + stdin=stdin, + data_dir=initialized_archive, + timeout=120, + env=RUN_TEST_ENV, + ) + + assert code == 0 + records = parse_jsonl_output(stdout) + + types = {r.get("type") for r in records} + # Should have processed Crawl and Snapshot, passed through Tag + assert "Crawl" in types or "Snapshot" in types or "Tag" in types + + +class TestRunEmpty: + """Tests for `archivebox run` edge cases.""" + + def test_run_empty_stdin(self, initialized_archive): + """Run with empty stdin returns success.""" + stdout, stderr, code = run_archivebox_cmd( + ["run"], + stdin="", + data_dir=initialized_archive, + ) + + assert code == 0 + + def test_run_no_records_to_process(self, initialized_archive): + """Run with only pass-through records shows message.""" + unknown = {"type": "Unknown", "id": 
"fake"} + + stdout, stderr, code = run_archivebox_cmd( + ["run"], + stdin=json.dumps(unknown), + data_dir=initialized_archive, + ) + + assert code == 0 + assert "No records to process" in stderr + + +class TestRunDaemonMode: + def test_run_daemon_processes_stdin_before_runner(self, monkeypatch): + from archivebox.cli import archivebox_run + + class FakeStdin: + def isatty(self): + return False + + monkeypatch.setattr(sys, "stdin", FakeStdin()) + calls = [] + monkeypatch.setattr( + archivebox_run, + "process_stdin_records", + lambda: calls.append("stdin") or 0, + ) + monkeypatch.setattr( + archivebox_run, + "run_runner", + lambda daemon=False: calls.append(f"runner:{daemon}") or 0, + ) + + with pytest.raises(SystemExit) as exit_info: + archivebox_run.main.callback(daemon=True, crawl_id=None, snapshot_id=None, binary_id=None) + + assert exit_info.value.code == 0 + assert calls == ["stdin", "runner:True"] + + def test_run_daemon_skips_runner_if_stdin_processing_fails(self, monkeypatch): + from archivebox.cli import archivebox_run + + class FakeStdin: + def isatty(self): + return False + + monkeypatch.setattr(sys, "stdin", FakeStdin()) + monkeypatch.setattr(archivebox_run, "process_stdin_records", lambda: 1) + monkeypatch.setattr( + archivebox_run, + "run_runner", + lambda daemon=False: (_ for _ in ()).throw(AssertionError("runner should not start after stdin failure")), + ) + + with pytest.raises(SystemExit) as exit_info: + archivebox_run.main.callback(daemon=True, crawl_id=None, snapshot_id=None, binary_id=None) + + assert exit_info.value.code == 1 + + +@pytest.mark.django_db +class TestRecoverOrphanedCrawls: + def test_recover_orphaned_crawl_requeues_started_crawl_without_active_processes(self): + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + from archivebox.services.runner import recover_orphaned_crawls + + crawl = Crawl.objects.create( + 
urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.STARTED, + retry_at=None, + ) + Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.QUEUED, + retry_at=None, + ) + + recovered = recover_orphaned_crawls() + + crawl.refresh_from_db() + assert recovered == 1 + assert crawl.status == Crawl.StatusChoices.STARTED + assert crawl.retry_at is not None + + def test_recover_orphaned_crawl_skips_active_child_processes(self): + import archivebox.machine.models as machine_models + from django.utils import timezone + + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + from archivebox.machine.models import Machine, Process + from archivebox.services.runner import recover_orphaned_crawls + + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.STARTED, + retry_at=None, + ) + snapshot = Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.QUEUED, + retry_at=None, + ) + + machine_models._CURRENT_MACHINE = None + machine = Machine.current() + Process.objects.create( + machine=machine, + process_type=Process.TypeChoices.HOOK, + status=Process.StatusChoices.RUNNING, + pwd=str(snapshot.output_dir / "chrome"), + cmd=["/plugins/chrome/on_CrawlSetup__91_chrome_wait.js"], + started_at=timezone.now(), + ) + + recovered = recover_orphaned_crawls() + + crawl.refresh_from_db() + assert recovered == 0 + assert crawl.retry_at is None + + def test_recover_orphaned_crawl_seals_when_all_snapshots_are_already_sealed(self): + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + from archivebox.services.runner import recover_orphaned_crawls + + crawl 
= Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.STARTED, + retry_at=None, + ) + Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.SEALED, + retry_at=None, + ) + + recovered = recover_orphaned_crawls() + + crawl.refresh_from_db() + assert recovered == 1 + assert crawl.status == Crawl.StatusChoices.SEALED + assert crawl.retry_at is None + + +@pytest.mark.django_db +class TestRecoverOrphanedSnapshots: + def test_recover_orphaned_snapshot_requeues_started_snapshot_without_active_processes(self): + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + from archivebox.services.runner import recover_orphaned_snapshots + + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.SEALED, + retry_at=None, + ) + snapshot = Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.STARTED, + retry_at=None, + ) + + recovered = recover_orphaned_snapshots() + + snapshot.refresh_from_db() + crawl.refresh_from_db() + + assert recovered == 1 + assert snapshot.status == Snapshot.StatusChoices.QUEUED + assert snapshot.retry_at is not None + assert crawl.status == Crawl.StatusChoices.QUEUED + assert crawl.retry_at is not None diff --git a/archivebox/tests/test_cli_schedule.py b/archivebox/tests/test_cli_schedule.py new file mode 100644 index 0000000000..1922312e3f --- /dev/null +++ b/archivebox/tests/test_cli_schedule.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 +"""CLI-specific tests for archivebox schedule.""" + +import os +import sqlite3 +import subprocess + + +def test_schedule_run_all_enqueues_scheduled_crawl(tmp_path, process, disable_extractors_dict): + os.chdir(tmp_path) + + subprocess.run( + ["archivebox", "schedule", 
"--every=daily", "--depth=0", "https://example.com"], + capture_output=True, + text=True, + check=True, + ) + + result = subprocess.run( + ["archivebox", "schedule", "--run-all"], + capture_output=True, + text=True, + env=disable_extractors_dict, + ) + + assert result.returncode == 0 + assert "Enqueued 1 scheduled crawl" in result.stdout + + conn = sqlite3.connect(tmp_path / "index.sqlite3") + try: + crawl_count = conn.execute("SELECT COUNT(*) FROM crawls_crawl").fetchone()[0] + queued_count = conn.execute("SELECT COUNT(*) FROM crawls_crawl WHERE status = 'queued'").fetchone()[0] + finally: + conn.close() + + assert crawl_count >= 2 + assert queued_count >= 1 + + +def test_schedule_without_import_path_creates_maintenance_schedule(tmp_path, process): + os.chdir(tmp_path) + + result = subprocess.run( + ["archivebox", "schedule", "--every=day"], + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + assert "Created scheduled maintenance update" in result.stdout + + conn = sqlite3.connect(tmp_path / "index.sqlite3") + try: + row = conn.execute( + "SELECT urls, status FROM crawls_crawl ORDER BY created_at DESC LIMIT 1", + ).fetchone() + finally: + conn.close() + + assert row == ("archivebox://update", "sealed") diff --git a/archivebox/tests/test_cli_search.py b/archivebox/tests/test_cli_search.py new file mode 100644 index 0000000000..71b2e213f9 --- /dev/null +++ b/archivebox/tests/test_cli_search.py @@ -0,0 +1,210 @@ +#!/usr/bin/env python3 +""" +Tests for archivebox search command. +Verify search queries snapshots from DB. 
+""" + +import json +import os +import subprocess + + +def test_search_finds_snapshots(tmp_path, process, disable_extractors_dict): + """Test that search command finds matching snapshots.""" + os.chdir(tmp_path) + + # Add snapshots + subprocess.run( + ["archivebox", "add", "--index-only", "--depth=0", "https://example.com"], + capture_output=True, + env=disable_extractors_dict, + ) + + # Search for it + result = subprocess.run( + ["archivebox", "search", "example"], + capture_output=True, + text=True, + timeout=30, + ) + + assert result.returncode == 0 + assert "example" in result.stdout + + +def test_search_returns_no_results_for_missing_term(tmp_path, process, disable_extractors_dict): + """Test search returns empty for non-existent term.""" + os.chdir(tmp_path) + + subprocess.run( + ["archivebox", "add", "--index-only", "--depth=0", "https://example.com"], + capture_output=True, + env=disable_extractors_dict, + ) + + result = subprocess.run( + ["archivebox", "search", "nonexistentterm12345"], + capture_output=True, + text=True, + timeout=30, + ) + + # Should complete with no results + assert result.returncode in [0, 1] + + +def test_search_on_empty_archive(tmp_path, process): + """Test search works on empty archive.""" + os.chdir(tmp_path) + + result = subprocess.run( + ["archivebox", "search", "anything"], + capture_output=True, + text=True, + timeout=30, + ) + + # Should complete without error + assert result.returncode in [0, 1] + + +def test_search_json_outputs_matching_snapshots(tmp_path, process, disable_extractors_dict): + """Test that search --json returns parseable matching snapshot rows.""" + os.chdir(tmp_path) + subprocess.run( + ["archivebox", "add", "--index-only", "--depth=0", "https://example.com"], + capture_output=True, + env=disable_extractors_dict, + check=True, + ) + + result = subprocess.run( + ["archivebox", "search", "--json"], + capture_output=True, + text=True, + timeout=30, + ) + + assert result.returncode == 0, result.stderr + payload 
= json.loads(result.stdout) + assert any("example.com" in row.get("url", "") for row in payload) + + +def test_search_json_with_headers_wraps_links_payload(tmp_path, process, disable_extractors_dict): + """Test that search --json --with-headers returns a headers envelope.""" + os.chdir(tmp_path) + subprocess.run( + ["archivebox", "add", "--index-only", "--depth=0", "https://example.com"], + capture_output=True, + env=disable_extractors_dict, + check=True, + ) + + result = subprocess.run( + ["archivebox", "search", "--json", "--with-headers"], + capture_output=True, + text=True, + timeout=30, + ) + + assert result.returncode == 0, result.stderr + payload = json.loads(result.stdout) + links = payload.get("links", payload) + assert any("example.com" in row.get("url", "") for row in links) + + +def test_search_html_outputs_markup(tmp_path, process, disable_extractors_dict): + """Test that search --html renders an HTML response.""" + os.chdir(tmp_path) + subprocess.run( + ["archivebox", "add", "--index-only", "--depth=0", "https://example.com"], + capture_output=True, + env=disable_extractors_dict, + check=True, + ) + + result = subprocess.run( + ["archivebox", "search", "--html"], + capture_output=True, + text=True, + timeout=30, + ) + + assert result.returncode == 0, result.stderr + assert "<" in result.stdout + + +def test_search_csv_outputs_requested_column(tmp_path, process, disable_extractors_dict): + """Test that search --csv emits the requested fields.""" + os.chdir(tmp_path) + subprocess.run( + ["archivebox", "add", "--index-only", "--depth=0", "https://example.com"], + capture_output=True, + env=disable_extractors_dict, + check=True, + ) + + result = subprocess.run( + ["archivebox", "search", "--csv", "url", "--with-headers"], + capture_output=True, + text=True, + timeout=30, + ) + + assert result.returncode == 0, result.stderr + assert "url" in result.stdout + assert "example.com" in result.stdout + + +def 
test_search_with_headers_requires_structured_output_format(tmp_path, process): + """Test that --with-headers is rejected without --json, --html, or --csv.""" + os.chdir(tmp_path) + + result = subprocess.run( + ["archivebox", "search", "--with-headers"], + capture_output=True, + text=True, + timeout=30, + ) + + assert result.returncode != 0 + assert "requires" in result.stderr.lower() or "json" in result.stderr.lower() + + +def test_search_sort_option_runs_successfully(tmp_path, process, disable_extractors_dict): + """Test that search --sort accepts sortable fields.""" + os.chdir(tmp_path) + for url in ["https://iana.org", "https://example.com"]: + subprocess.run( + ["archivebox", "add", "--index-only", "--depth=0", url], + capture_output=True, + env=disable_extractors_dict, + check=True, + ) + + result = subprocess.run( + ["archivebox", "search", "--csv", "url", "--sort=url"], + capture_output=True, + text=True, + timeout=30, + ) + + assert result.returncode == 0, result.stderr + assert "example.com" in result.stdout or "iana.org" in result.stdout + + +def test_search_help_lists_supported_filters(tmp_path, process): + """Test that search --help documents the available filters and output modes.""" + os.chdir(tmp_path) + + result = subprocess.run( + ["archivebox", "search", "--help"], + capture_output=True, + text=True, + timeout=30, + ) + + assert result.returncode == 0 + assert "--filter-type" in result.stdout or "-f" in result.stdout + assert "--status" in result.stdout + assert "--sort" in result.stdout diff --git a/archivebox/tests/test_cli_server.py b/archivebox/tests/test_cli_server.py new file mode 100644 index 0000000000..dd7224c227 --- /dev/null +++ b/archivebox/tests/test_cli_server.py @@ -0,0 +1,169 @@ +#!/usr/bin/env python3 +""" +Tests for archivebox server command. +Verify server can start (basic smoke tests only, no full server testing). 
+""" + +import os +import subprocess +import sys +from unittest.mock import Mock + + +def test_sqlite_connections_use_explicit_30_second_busy_timeout(): + from archivebox.core.settings import SQLITE_CONNECTION_OPTIONS + + assert SQLITE_CONNECTION_OPTIONS["OPTIONS"]["timeout"] == 30 + assert "PRAGMA busy_timeout = 30000;" in SQLITE_CONNECTION_OPTIONS["OPTIONS"]["init_command"] + + +def test_server_shows_usage_info(tmp_path, process): + """Test that server command shows usage or starts.""" + os.chdir(tmp_path) + + # Just check that the command is recognized + # We won't actually start a full server in tests + result = subprocess.run( + ["archivebox", "server", "--help"], + capture_output=True, + text=True, + timeout=10, + ) + + assert result.returncode == 0 + assert "server" in result.stdout.lower() or "http" in result.stdout.lower() + + +def test_server_init_flag(tmp_path, process): + """Test that --init flag runs init before starting server.""" + os.chdir(tmp_path) + + # Check init flag is recognized + result = subprocess.run( + ["archivebox", "server", "--help"], + capture_output=True, + text=True, + timeout=10, + ) + + assert result.returncode == 0 + assert "--init" in result.stdout or "init" in result.stdout.lower() + + +def test_runner_worker_uses_current_interpreter(): + """The supervised runner should use the active Python environment, not PATH.""" + from archivebox.workers.supervisord_util import RUNNER_WORKER + + assert RUNNER_WORKER["command"] == f"{sys.executable} -m archivebox run --daemon" + + +def test_reload_workers_use_current_interpreter_and_supervisord_managed_runner(): + from archivebox.workers.supervisord_util import RUNNER_WATCH_WORKER, RUNSERVER_WORKER + + runserver = RUNSERVER_WORKER("127.0.0.1", "8000", reload=True, pidfile="/tmp/runserver.pid") + watcher = RUNNER_WATCH_WORKER("/tmp/runserver.pid") + + assert runserver["name"] == "worker_runserver" + assert runserver["command"] == f"{sys.executable} -m archivebox manage runserver 
127.0.0.1:8000" + assert 'ARCHIVEBOX_RUNSERVER="1"' in runserver["environment"] + assert 'ARCHIVEBOX_AUTORELOAD="1"' in runserver["environment"] + assert 'ARCHIVEBOX_RUNSERVER_PIDFILE="/tmp/runserver.pid"' in runserver["environment"] + + assert watcher["name"] == "worker_runner_watch" + assert watcher["command"] == f"{sys.executable} -m archivebox manage runner_watch --pidfile=/tmp/runserver.pid" + + +def test_stop_existing_background_runner_cleans_up_and_stops_orchestrators(): + from archivebox.cli.archivebox_server import stop_existing_background_runner + + runner_a = Mock() + runner_a.kill_tree = Mock() + runner_a.terminate = Mock() + runner_b = Mock() + runner_b.kill_tree = Mock(side_effect=RuntimeError("boom")) + runner_b.terminate = Mock() + + process_model = Mock() + process_model.StatusChoices.RUNNING = "running" + process_model.TypeChoices.ORCHESTRATOR = "orchestrator" + queryset = Mock() + queryset.order_by.return_value = [runner_a, runner_b] + process_model.objects.filter.return_value = queryset + + supervisor = Mock() + stop_worker = Mock() + log = Mock() + + stopped = stop_existing_background_runner( + machine=Mock(), + process_model=process_model, + supervisor=supervisor, + stop_worker_fn=stop_worker, + log=log, + ) + + assert stopped == 2 + assert process_model.cleanup_stale_running.call_count == 2 + stop_worker.assert_any_call(supervisor, "worker_runner") + stop_worker.assert_any_call(supervisor, "worker_runner_watch") + runner_a.kill_tree.assert_called_once_with(graceful_timeout=2.0) + runner_b.terminate.assert_called_once_with(graceful_timeout=2.0) + log.assert_called_once() + + +def test_stop_existing_server_workers_takes_over_same_runserver_port(monkeypatch): + from archivebox.cli.archivebox_server import stop_existing_server_workers + + supervisor = Mock() + supervisor.getProcessInfo.side_effect = lambda name: { + "worker_runserver": {"statename": "RUNNING"}, + "worker_daphne": {"statename": "STOPPED"}, + }.get(name, None) + stop_worker = 
Mock() + log = Mock() + + monkeypatch.setattr( + "archivebox.cli.archivebox_server._read_supervisor_worker_command", + lambda worker_name: f"{sys.executable} -m archivebox manage runserver 0.0.0.0:8000" if worker_name == "worker_runserver" else "", + ) + + stopped = stop_existing_server_workers( + supervisor=supervisor, + stop_worker_fn=stop_worker, + host="0.0.0.0", + port="8000", + log=log, + ) + + assert stopped == 1 + stop_worker.assert_called_once_with(supervisor, "worker_runserver") + log.assert_called_once() + + +def test_stop_existing_server_workers_leaves_different_port_running(monkeypatch): + from archivebox.cli.archivebox_server import stop_existing_server_workers + + supervisor = Mock() + supervisor.getProcessInfo.side_effect = lambda name: { + "worker_runserver": {"statename": "RUNNING"}, + "worker_daphne": {"statename": "STOPPED"}, + }.get(name, None) + stop_worker = Mock() + log = Mock() + + monkeypatch.setattr( + "archivebox.cli.archivebox_server._read_supervisor_worker_command", + lambda worker_name: f"{sys.executable} -m archivebox manage runserver 127.0.0.1:9000" if worker_name == "worker_runserver" else "", + ) + + stopped = stop_existing_server_workers( + supervisor=supervisor, + stop_worker_fn=stop_worker, + host="0.0.0.0", + port="8000", + log=log, + ) + + assert stopped == 0 + stop_worker.assert_not_called() + log.assert_not_called() diff --git a/archivebox/tests/test_cli_shell.py b/archivebox/tests/test_cli_shell.py new file mode 100644 index 0000000000..c2a8142c36 --- /dev/null +++ b/archivebox/tests/test_cli_shell.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python3 +""" +Tests for archivebox shell command. +Verify shell command starts Django shell (basic smoke tests only). 
+""" + +import os +import subprocess + + +def test_shell_command_exists(tmp_path, process): + """Test that shell command is recognized.""" + os.chdir(tmp_path) + + # Test that the command exists (will fail without input but should recognize command) + result = subprocess.run( + ["archivebox", "shell", "--help"], + capture_output=True, + text=True, + timeout=10, + ) + + # Should show shell help or recognize command + assert result.returncode in [0, 1, 2] + + +def test_shell_c_executes_python(tmp_path, process): + """shell -c should fully initialize Django and run the provided command.""" + os.chdir(tmp_path) + + result = subprocess.run( + ["archivebox", "shell", "-c", 'print("shell-ok")'], + capture_output=True, + text=True, + timeout=30, + ) + + assert result.returncode == 0, result.stderr + assert "shell-ok" in result.stdout diff --git a/archivebox/tests/test_cli_snapshot.py b/archivebox/tests/test_cli_snapshot.py new file mode 100644 index 0000000000..ab821d6ac0 --- /dev/null +++ b/archivebox/tests/test_cli_snapshot.py @@ -0,0 +1,301 @@ +""" +Tests for archivebox snapshot CLI command. 
+ +Tests cover: +- snapshot create (from URLs, from Crawl JSONL, pass-through) +- snapshot list (with filters) +- snapshot update +- snapshot delete +""" + +import json + +from archivebox.tests.conftest import ( + run_archivebox_cmd, + parse_jsonl_output, + create_test_url, +) + + +class TestSnapshotCreate: + """Tests for `archivebox snapshot create`.""" + + def test_create_from_url_args(self, initialized_archive): + """Create snapshot from URL arguments.""" + url = create_test_url() + + stdout, stderr, code = run_archivebox_cmd( + ["snapshot", "create", url], + data_dir=initialized_archive, + ) + + assert code == 0, f"Command failed: {stderr}" + assert "Created" in stderr + + records = parse_jsonl_output(stdout) + assert len(records) == 1 + assert records[0]["type"] == "Snapshot" + assert records[0]["url"] == url + + def test_create_from_crawl_jsonl(self, initialized_archive): + """Create snapshots from Crawl JSONL input.""" + url = create_test_url() + + # First create a crawl + stdout1, _, _ = run_archivebox_cmd(["crawl", "create", url], data_dir=initialized_archive) + crawl = parse_jsonl_output(stdout1)[0] + + # Pipe crawl to snapshot create + stdout2, stderr, code = run_archivebox_cmd( + ["snapshot", "create"], + stdin=json.dumps(crawl), + data_dir=initialized_archive, + ) + + assert code == 0, f"Command failed: {stderr}" + + records = parse_jsonl_output(stdout2) + # Should have the Crawl passed through and the Snapshot created + types = [r.get("type") for r in records] + assert "Crawl" in types + assert "Snapshot" in types + + snapshot = next(r for r in records if r["type"] == "Snapshot") + assert snapshot["url"] == url + + def test_create_with_tag(self, initialized_archive): + """Create snapshot with --tag flag.""" + url = create_test_url() + + stdout, stderr, code = run_archivebox_cmd( + ["snapshot", "create", "--tag=test-tag", url], + data_dir=initialized_archive, + ) + + assert code == 0 + records = parse_jsonl_output(stdout) + assert "test-tag" in 
records[0].get("tags", "") + + def test_create_pass_through_other_types(self, initialized_archive): + """Pass-through records of other types unchanged.""" + tag_record = {"type": "Tag", "id": "fake-tag-id", "name": "test"} + url = create_test_url() + stdin = json.dumps(tag_record) + "\n" + json.dumps({"url": url}) + + stdout, stderr, code = run_archivebox_cmd( + ["snapshot", "create"], + stdin=stdin, + data_dir=initialized_archive, + ) + + assert code == 0 + records = parse_jsonl_output(stdout) + + types = [r.get("type") for r in records] + assert "Tag" in types + assert "Snapshot" in types + + def test_create_multiple_urls(self, initialized_archive): + """Create snapshots from multiple URLs.""" + urls = [create_test_url() for _ in range(3)] + + stdout, stderr, code = run_archivebox_cmd( + ["snapshot", "create"] + urls, + data_dir=initialized_archive, + ) + + assert code == 0 + records = parse_jsonl_output(stdout) + assert len(records) == 3 + + created_urls = {r["url"] for r in records} + for url in urls: + assert url in created_urls + + +class TestSnapshotList: + """Tests for `archivebox snapshot list`.""" + + def test_list_empty(self, initialized_archive): + """List with no snapshots returns empty.""" + stdout, stderr, code = run_archivebox_cmd( + ["snapshot", "list"], + data_dir=initialized_archive, + ) + + assert code == 0 + assert "Listed 0 snapshots" in stderr + + def test_list_returns_created(self, initialized_archive): + """List returns previously created snapshots.""" + url = create_test_url() + run_archivebox_cmd(["snapshot", "create", url], data_dir=initialized_archive) + + stdout, stderr, code = run_archivebox_cmd( + ["snapshot", "list"], + data_dir=initialized_archive, + ) + + assert code == 0 + records = parse_jsonl_output(stdout) + assert len(records) >= 1 + assert any(r.get("url") == url for r in records) + + def test_list_filter_by_status(self, initialized_archive): + """Filter snapshots by status.""" + url = create_test_url() + 
run_archivebox_cmd(["snapshot", "create", url], data_dir=initialized_archive) + + stdout, stderr, code = run_archivebox_cmd( + ["snapshot", "list", "--status=queued"], + data_dir=initialized_archive, + ) + + assert code == 0 + records = parse_jsonl_output(stdout) + for r in records: + assert r["status"] == "queued" + + def test_list_filter_by_url_contains(self, initialized_archive): + """Filter snapshots by URL contains.""" + url = create_test_url(domain="unique-domain-12345.com") + run_archivebox_cmd(["snapshot", "create", url], data_dir=initialized_archive) + + stdout, stderr, code = run_archivebox_cmd( + ["snapshot", "list", "--url__icontains=unique-domain-12345"], + data_dir=initialized_archive, + ) + + assert code == 0 + records = parse_jsonl_output(stdout) + assert len(records) == 1 + assert "unique-domain-12345" in records[0]["url"] + + def test_list_with_limit(self, initialized_archive): + """Limit number of results.""" + for _ in range(3): + run_archivebox_cmd(["snapshot", "create", create_test_url()], data_dir=initialized_archive) + + stdout, stderr, code = run_archivebox_cmd( + ["snapshot", "list", "--limit=2"], + data_dir=initialized_archive, + ) + + assert code == 0 + records = parse_jsonl_output(stdout) + assert len(records) == 2 + + def test_list_with_sort_and_limit(self, initialized_archive): + """Sorting should be applied before limiting.""" + for _ in range(3): + run_archivebox_cmd(["snapshot", "create", create_test_url()], data_dir=initialized_archive) + + stdout, stderr, code = run_archivebox_cmd( + ["snapshot", "list", "--limit=2", "--sort=-created_at"], + data_dir=initialized_archive, + ) + + assert code == 0, f"Command failed: {stderr}" + records = parse_jsonl_output(stdout) + assert len(records) == 2 + + def test_list_search_meta(self, initialized_archive): + """snapshot list should support metadata search mode.""" + url = create_test_url(domain="meta-search-example.com") + run_archivebox_cmd(["snapshot", "create", url], 
class TestSnapshotUpdate:
    """Tests for `archivebox snapshot update`."""

    @staticmethod
    def _update_fresh_snapshot(data_dir, flag):
        """Create one snapshot, pipe its JSON record into `snapshot update <flag>`.

        Returns whatever `run_archivebox_cmd` returns for the update call
        (unpacked by the callers as stdout, stderr, exit code).
        """
        created_out, _, _ = run_archivebox_cmd(
            ["snapshot", "create", create_test_url()],
            data_dir=data_dir,
        )
        record = parse_jsonl_output(created_out)[0]
        return run_archivebox_cmd(
            ["snapshot", "update", flag],
            stdin=json.dumps(record),
            data_dir=data_dir,
        )

    def test_update_status(self, initialized_archive):
        """`--status=started` is reflected in the record echoed on stdout."""
        out, err, exit_code = self._update_fresh_snapshot(initialized_archive, "--status=started")

        assert exit_code == 0
        assert "Updated 1 snapshots" in err
        assert parse_jsonl_output(out)[0]["status"] == "started"

    def test_update_add_tag(self, initialized_archive):
        """`--tag=new-tag` exits 0 and reports one updated snapshot on stderr."""
        _out, err, exit_code = self._update_fresh_snapshot(initialized_archive, "--tag=new-tag")

        assert exit_code == 0
        assert "Updated 1 snapshots" in err
stderr + + def test_delete_with_yes(self, initialized_archive): + """Delete with --yes flag works.""" + url = create_test_url() + stdout1, _, _ = run_archivebox_cmd(["snapshot", "create", url], data_dir=initialized_archive) + snapshot = parse_jsonl_output(stdout1)[0] + + stdout, stderr, code = run_archivebox_cmd( + ["snapshot", "delete", "--yes"], + stdin=json.dumps(snapshot), + data_dir=initialized_archive, + ) + + assert code == 0 + assert "Deleted 1 snapshots" in stderr + + def test_delete_dry_run(self, initialized_archive): + """Dry run shows what would be deleted.""" + url = create_test_url() + stdout1, _, _ = run_archivebox_cmd(["snapshot", "create", url], data_dir=initialized_archive) + snapshot = parse_jsonl_output(stdout1)[0] + + stdout, stderr, code = run_archivebox_cmd( + ["snapshot", "delete", "--dry-run"], + stdin=json.dumps(snapshot), + data_dir=initialized_archive, + ) + + assert code == 0 + assert "Would delete" in stderr diff --git a/archivebox/tests/test_cli_status.py b/archivebox/tests/test_cli_status.py new file mode 100644 index 0000000000..9f77dbeaca --- /dev/null +++ b/archivebox/tests/test_cli_status.py @@ -0,0 +1,225 @@ +#!/usr/bin/env python3 +""" +Comprehensive tests for archivebox status command. +Verify status reports accurate collection state from DB and filesystem. 
+""" + +import os +import sqlite3 +import subprocess +from pathlib import Path + + +def _find_snapshot_dir(data_dir: Path, snapshot_id: str) -> Path | None: + candidates = {snapshot_id} + if len(snapshot_id) == 32: + candidates.add(f"{snapshot_id[:8]}-{snapshot_id[8:12]}-{snapshot_id[12:16]}-{snapshot_id[16:20]}-{snapshot_id[20:]}") + elif len(snapshot_id) == 36 and "-" in snapshot_id: + candidates.add(snapshot_id.replace("-", "")) + + for needle in candidates: + for path in data_dir.rglob(needle): + if path.is_dir(): + return path + return None + + +def test_status_runs_successfully(tmp_path, process): + """Test that status command runs without error.""" + os.chdir(tmp_path) + result = subprocess.run(["archivebox", "status"], capture_output=True, text=True) + + assert result.returncode == 0 + assert len(result.stdout) > 100 + + +def test_status_shows_zero_snapshots_in_empty_archive(tmp_path, process): + """Test status shows 0 snapshots in empty archive.""" + os.chdir(tmp_path) + result = subprocess.run(["archivebox", "status"], capture_output=True, text=True) + + output = result.stdout + # Should indicate empty/zero state + assert "0" in output + + +def test_status_shows_correct_snapshot_count(tmp_path, process, disable_extractors_dict): + """Test that status shows accurate snapshot count from DB.""" + os.chdir(tmp_path) + + # Add 3 snapshots + for url in ["https://example.com", "https://example.org", "https://example.net"]: + subprocess.run( + ["archivebox", "add", "--index-only", "--depth=0", url], + capture_output=True, + env=disable_extractors_dict, + ) + + result = subprocess.run(["archivebox", "status"], capture_output=True, text=True) + + # Verify DB has 3 snapshots + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + db_count = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0] + conn.close() + + assert db_count == 3 + # Status output should show 3 + assert "3" in result.stdout + + +def test_status_shows_archived_count(tmp_path, 
def test_status_shows_archive_directory_size(tmp_path, process):
    """`archivebox status` output mentions a size ("Size" or "size")."""
    os.chdir(tmp_path)
    status_stdout = subprocess.run(
        ["archivebox", "status"],
        capture_output=True,
        text=True,
    ).stdout

    # Either capitalisation of the label is accepted.
    assert any(label in status_stdout for label in ("Size", "size"))
def test_status_counts_new_snapshot_output_dirs_as_archived(tmp_path, process, disable_extractors_dict):
    """A snapshot dir holding `title/title.txt` is reported as archived and present.

    Adds one index-only snapshot, looks up its ID in the SQLite index, writes a
    `title/title.txt` file into its output directory, then expects
    `archivebox status` to print "archived: 1" and "present: 1".
    """
    os.chdir(tmp_path)
    target_url = "https://example.com"
    env = dict(disable_extractors_dict)
    env["ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS"] = "true"

    subprocess.run(
        ["archivebox", "add", "--index-only", "--depth=0", target_url],
        capture_output=True,
        env=env,
        check=True,
    )

    db = sqlite3.connect("index.sqlite3")
    row = db.cursor().execute("SELECT id FROM core_snapshot WHERE url = ?", (target_url,)).fetchone()
    snapshot_id = row[0]
    db.close()

    snapshot_dir = _find_snapshot_dir(tmp_path, str(snapshot_id))
    assert snapshot_dir is not None, f"Snapshot output directory not found for {snapshot_id}"

    # Drop a fake extractor output into the snapshot directory.
    title_file = snapshot_dir / "title" / "title.txt"
    title_file.parent.mkdir(parents=True, exist_ok=True)
    title_file.write_text("Example Domain")

    status = subprocess.run(["archivebox", "status"], capture_output=True, text=True, env=env)

    assert status.returncode == 0, status.stdout + status.stderr
    for expected in ("archived: 1", "present: 1"):
        assert expected in status.stdout
def test_status_shows_index_file_info(tmp_path, process):
    """`archivebox status` output mentions the index (matched case-insensitively)."""
    os.chdir(tmp_path)
    completed = subprocess.run(["archivebox", "status"], capture_output=True, text=True)

    # A case-insensitive match already covers the capitalised "Index" spelling.
    assert "index" in completed.stdout.lower()
def test_update_specific_snapshot_by_filter(tmp_path, process, disable_extractors_dict):
    """`archivebox update` with a substring filter pattern exits successfully."""
    os.chdir(tmp_path)

    # Seed two snapshots so the filter has more than one row to choose from.
    for seed_url in ("https://example.com", "https://example.org"):
        subprocess.run(
            ["archivebox", "add", "--index-only", "--depth=0", seed_url],
            capture_output=True,
            env=disable_extractors_dict,
            timeout=90,
        )

    # The trailing positional argument is the filter pattern.
    update_cmd = ["archivebox", "update", "--filter-type=substring", "example.com"]
    completed = subprocess.run(
        update_cmd,
        capture_output=True,
        env=disable_extractors_dict,
        timeout=30,
    )

    assert completed.returncode == 0
def test_update_queues_snapshots_for_archiving(tmp_path, process, disable_extractors_dict):
    """After `archivebox update`, the snapshot row in the index has status "queued"."""
    os.chdir(tmp_path)
    run_opts = {"capture_output": True, "env": disable_extractors_dict}

    subprocess.run(
        ["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
        timeout=90,
        **run_opts,
    )

    update_result = subprocess.run(["archivebox", "update"], timeout=30, **run_opts)
    assert update_result.returncode == 0

    # Read the status straight from the SQLite index.
    db = sqlite3.connect("index.sqlite3")
    row = db.cursor().execute("SELECT status FROM core_snapshot").fetchone()
    snapshot_status = row[0]
    db.close()

    assert snapshot_status == "queued"
+""" + +import os +import re +import sys +import tempfile +import subprocess +from pathlib import Path + +from .fixtures import process + +FIXTURES = (process,) + + +def _archivebox_cli() -> str: + cli = Path(sys.executable).with_name("archivebox") + return str(cli if cli.exists() else "archivebox") + + +def _run_real_cli( + args: list[str], + cwd: Path, + *, + home_dir: Path, + timeout: int = 180, + extra_env: dict[str, str] | None = None, +) -> subprocess.CompletedProcess[str]: + env = os.environ.copy() + env.pop("DATA_DIR", None) + env["HOME"] = str(home_dir) + env["USE_COLOR"] = "False" + env["SHOW_PROGRESS"] = "False" + if extra_env: + env.update(extra_env) + return subprocess.run( + [_archivebox_cli(), *args], + capture_output=True, + text=True, + cwd=cwd, + env=env, + timeout=timeout, + ) + + +def _make_deep_collection_dir(tmp_path: Path) -> Path: + deep_dir = tmp_path / "deep-collection" + for idx in range(6): + deep_dir /= f"segment-{idx}-1234567890abcdef" + deep_dir.mkdir(parents=True) + return deep_dir + + +def _extract_location_path(output: str, key: str) -> Path: + for line in output.splitlines(): + if key not in line: + continue + columns = [column for column in re.split(r"\s{2,}", line.strip()) if column] + if len(columns) >= 5 and columns[1] == key: + return Path(os.path.expanduser(columns[-1])) + raise AssertionError(f"Did not find a {key} location line in output:\n{output}") + + +def test_version_quiet_outputs_version_number(tmp_path): + """Test that version --quiet outputs just the version number.""" + os.chdir(tmp_path) + result = subprocess.run(["archivebox", "version", "--quiet"], capture_output=True, text=True) + + assert result.returncode == 0 + version = result.stdout.strip() + assert version + # Version should be semver-ish format (e.g., 0.8.0) + parts = version.split(".") + assert len(parts) >= 2 + + +def test_version_flag_outputs_version_number(tmp_path): + """Test that top-level --version reports the package version.""" + 
def test_version_shows_system_info_in_initialized_dir(tmp_path, process):
    """`archivebox version` prints "ArchiveBox" and at least one system field."""
    os.chdir(tmp_path)
    version_stdout = subprocess.run(
        ["archivebox", "version"],
        capture_output=True,
        text=True,
    ).stdout

    assert "ArchiveBox" in version_stdout

    # One of these KEY= markers is enough to count as system info.
    system_markers = ("ARCH=", "OS=", "PYTHON=")
    found = [marker for marker in system_markers if marker in version_stdout]
    assert found
def test_version_invalid_option_fails(tmp_path):
    """An unknown option to `archivebox version` yields a non-zero exit code."""
    os.chdir(tmp_path)
    argv = ["archivebox", "version", "--invalid-option"]
    exit_code = subprocess.run(argv, capture_output=True, text=True).returncode

    assert exit_code != 0
def test_config_set_value_writes_to_config_file(tmp_path, process):
    """Test that --set writes config value to ArchiveBox.conf file.

    Runs `archivebox config --set TIMEOUT=120` in the collection directory and
    then reads `ArchiveBox.conf` directly to confirm the key was persisted.
    """
    os.chdir(tmp_path)

    # Set a config value
    result = subprocess.run(
        ["archivebox", "config", "--set", "TIMEOUT=120"],
        capture_output=True,
        text=True,
    )
    assert result.returncode == 0, result.stderr

    # Read the config file directly to verify it was written.
    config_file = tmp_path / "ArchiveBox.conf"
    # Fail loudly when the file is missing: the content check used to be
    # skipped in that case, so the test passed without verifying any write.
    assert config_file.exists(), f"{config_file} was not written by `config --set`"

    config_content = config_file.read_text()
    # Key match is case-insensitive, as before.
    assert "timeout" in config_content.lower()
class TestConfigCLI:
    """Test the CLI interface for config command."""

    def test_cli_help(self, tmp_path, process):
        """`archivebox config --help` exits 0 and documents --get and --set."""
        os.chdir(tmp_path)

        completed = subprocess.run(
            ["archivebox", "config", "--help"],
            capture_output=True,
            text=True,
        )

        assert completed.returncode == 0
        for flag in ("--get", "--set"):
            assert flag in completed.stdout
import views as config_views +from archivebox.core import views as core_views +from archivebox.machine.models import Binary + + +pytestmark = pytest.mark.django_db + + +def test_get_db_binaries_by_name_collapses_youtube_dl_aliases(monkeypatch): + now = timezone.now() + records = [ + SimpleNamespace( + name="youtube-dl", + version="", + binprovider="", + abspath="/usr/bin/youtube-dl", + status=Binary.StatusChoices.INSTALLED, + modified_at=now, + ), + SimpleNamespace( + name="yt-dlp", + version="2026.03.01", + binprovider="pip", + abspath="/usr/bin/yt-dlp", + status=Binary.StatusChoices.INSTALLED, + modified_at=now + timedelta(seconds=1), + ), + ] + + monkeypatch.setattr(config_views.Binary, "objects", SimpleNamespace(all=lambda: records)) + + binaries = config_views.get_db_binaries_by_name() + + assert "yt-dlp" in binaries + assert "youtube-dl" not in binaries + assert binaries["yt-dlp"].version == "2026.03.01" + + +def test_binaries_list_view_uses_db_version_and_hides_youtube_dl_alias(monkeypatch): + request = RequestFactory().get("/admin/environment/binaries/") + request.user = SimpleNamespace(is_superuser=True) + + db_binary = SimpleNamespace( + name="youtube-dl", + version="2026.03.01", + binprovider="pip", + abspath="/usr/bin/yt-dlp", + status=Binary.StatusChoices.INSTALLED, + sha256="", + modified_at=timezone.now(), + ) + + monkeypatch.setattr(config_views, "get_db_binaries_by_name", lambda: {"yt-dlp": db_binary}) + + context = config_views.binaries_list_view.__wrapped__(request) + + assert len(context["table"]["Binary Name"]) == 1 + assert str(context["table"]["Binary Name"][0].link_item) == "yt-dlp" + assert context["table"]["Found Version"][0] == "✅ 2026.03.01" + assert context["table"]["Provided By"][0] == "pip" + assert context["table"]["Found Abspath"][0] == "/usr/bin/yt-dlp" + + +def test_binaries_list_view_only_shows_persisted_records(monkeypatch): + request = RequestFactory().get("/admin/environment/binaries/") + request.user = 
SimpleNamespace(is_superuser=True) + + monkeypatch.setattr(config_views, "get_db_binaries_by_name", lambda: {}) + + context = config_views.binaries_list_view.__wrapped__(request) + + assert context["table"]["Binary Name"] == [] + assert context["table"]["Found Version"] == [] + assert context["table"]["Provided By"] == [] + assert context["table"]["Found Abspath"] == [] + + +def test_binary_detail_view_uses_canonical_db_record(monkeypatch): + request = RequestFactory().get("/admin/environment/binaries/youtube-dl/") + request.user = SimpleNamespace(is_superuser=True) + + db_binary = SimpleNamespace( + id="019d14cc-6c40-7793-8ff1-0f8bb050e8a3", + name="yt-dlp", + version="2026.03.01", + binprovider="pip", + abspath="/usr/bin/yt-dlp", + sha256="abc123", + status=Binary.StatusChoices.INSTALLED, + modified_at=timezone.now(), + ) + + monkeypatch.setattr(config_views, "get_db_binaries_by_name", lambda: {"yt-dlp": db_binary}) + + context = config_views.binary_detail_view.__wrapped__(request, key="youtube-dl") + section = context["data"][0] + + assert context["title"] == "yt-dlp" + assert section["fields"]["name"] == "yt-dlp" + assert section["fields"]["version"] == "2026.03.01" + assert section["fields"]["binprovider"] == "pip" + assert section["fields"]["abspath"] == "/usr/bin/yt-dlp" + assert "/admin/machine/binary/019d14cc-6c40-7793-8ff1-0f8bb050e8a3/change/?_changelist_filters=q%3Dyt-dlp" in section["description"] + + +def test_binary_detail_view_marks_unrecorded_binary(monkeypatch): + request = RequestFactory().get("/admin/environment/binaries/wget/") + request.user = SimpleNamespace(is_superuser=True) + + monkeypatch.setattr(config_views, "get_db_binaries_by_name", lambda: {}) + + context = config_views.binary_detail_view.__wrapped__(request, key="wget") + section = context["data"][0] + + assert section["description"] == "No persisted Binary record found" + assert section["fields"]["status"] == "unrecorded" + assert section["fields"]["binprovider"] == "not recorded" 
+ + +def test_plugin_detail_view_renders_config_in_dedicated_sections(monkeypatch): + request = RequestFactory().get("/admin/environment/plugins/builtin.example/") + request.user = SimpleNamespace(is_superuser=True) + + plugin_config = { + "title": "Example Plugin", + "description": "Example config used to verify plugin metadata rendering.", + "type": "object", + "required_plugins": ["chrome"], + "required_binaries": [ + { + "name": "example-cli", + "binproviders": "env,apt,brew", + "min_version": None, + }, + ], + "output_mimetypes": ["text/plain", "application/json"], + "properties": { + "EXAMPLE_ENABLED": { + "type": "boolean", + "description": "Enable the example plugin.", + "x-fallback": "CHECK_SSL_VALIDITY", + }, + "EXAMPLE_BINARY": { + "type": "string", + "default": "gallery-dl", + "description": "Filesystem path for example output.", + "x-aliases": ["USE_EXAMPLE_BINARY"], + }, + }, + } + + monkeypatch.setattr( + config_views, + "get_filesystem_plugins", + lambda: { + "builtin.example": { + "id": "builtin.example", + "name": "example", + "source": "builtin", + "path": "/plugins/example", + "hooks": ["on_Snapshot__01_example.py"], + "config": plugin_config, + }, + }, + ) + monkeypatch.setattr(config_views, "get_machine_admin_url", lambda: "/admin/machine/machine/test-machine/change/") + + context = config_views.plugin_detail_view.__wrapped__(request, key="builtin.example") + + assert context["title"] == "example" + assert len(context["data"]) == 5 + + summary_section, hooks_section, metadata_section, config_section, properties_section = context["data"] + + assert summary_section["fields"] == { + "id": "builtin.example", + "name": "example", + "source": "builtin", + } + assert "/plugins/example" in summary_section["description"] + assert "https://archivebox.github.io/abx-plugins/#example" in summary_section["description"] + + assert hooks_section["name"] == "Hooks" + assert hooks_section["fields"] == {} + assert ( + 
"https://github.com/ArchiveBox/abx-plugins/tree/main/abx_plugins/plugins/example/on_Snapshot__01_example.py" + in hooks_section["description"] + ) + assert "on_Snapshot__01_example.py" in hooks_section["description"] + + assert metadata_section["name"] == "Plugin Metadata" + assert metadata_section["fields"] == {} + assert "Example Plugin" in metadata_section["description"] + assert "Example config used to verify plugin metadata rendering." in metadata_section["description"] + assert "https://archivebox.github.io/abx-plugins/#chrome" in metadata_section["description"] + assert "/admin/environment/binaries/example-cli/" in metadata_section["description"] + assert "text/plain" in metadata_section["description"] + assert "application/json" in metadata_section["description"] + + assert config_section["name"] == "config.json" + assert config_section["fields"] == {} + assert "
    "properties"' in config_section["description"]
    +
    +    assert properties_section["name"] == "Config Properties"
    +    assert properties_section["fields"] == {}
    +    assert "/admin/machine/machine/test-machine/change/" in properties_section["description"]
    +    assert "/admin/machine/binary/" in properties_section["description"]
    +    assert "/admin/environment/binaries/" in properties_section["description"]
    +    assert "EXAMPLE_ENABLED" in properties_section["description"]
    +    assert "boolean" in properties_section["description"]
    +    assert "Enable the example plugin." in properties_section["description"]
    +    assert "/admin/environment/config/EXAMPLE_ENABLED/" in properties_section["description"]
    +    assert "/admin/environment/config/CHECK_SSL_VALIDITY/" in properties_section["description"]
    +    assert "/admin/environment/config/USE_EXAMPLE_BINARY/" in properties_section["description"]
    +    assert "/admin/environment/binaries/gallery-dl/" in properties_section["description"]
    +    assert "EXAMPLE_BINARY" in properties_section["description"]
    +
    +
    +def test_get_config_definition_link_keeps_core_config_search_link(monkeypatch):
    +    """A key that no plugin claims keeps the github.com/search definition link."""
    +    monkeypatch.setattr(core_views, "find_plugin_for_config_key", lambda key: None)
    +
    +    link_url, link_label = core_views.get_config_definition_link("CHECK_SSL_VALIDITY")
    +    assert link_label == "archivebox/config"
    +    for fragment in ("github.com/search", "CHECK_SSL_VALIDITY"):
    +        assert fragment in link_url
    +
    +
    +def test_get_config_definition_link_uses_plugin_config_json_for_plugin_options(monkeypatch):
    +    """A key that resolves to a plugin links to that plugin's config.json on GitHub."""
    +    outlinks_dir = core_views.BUILTIN_PLUGINS_DIR / "parse_dom_outlinks"
    +    monkeypatch.setattr(core_views, "iter_plugin_dirs", lambda: [outlinks_dir])
    +    monkeypatch.setattr(core_views, "find_plugin_for_config_key", lambda key: "parse_dom_outlinks")
    +
    +    link_url, link_label = core_views.get_config_definition_link("PARSE_DOM_OUTLINKS_ENABLED")
    +
    +    assert link_label == "abx_plugins/plugins/parse_dom_outlinks/config.json"
    +    assert link_url == "https://github.com/ArchiveBox/abx-plugins/tree/main/abx_plugins/plugins/parse_dom_outlinks/config.json"
    +
    +
    +def test_live_config_value_view_renames_source_field_and_uses_plugin_definition_link(monkeypatch):
    +    """The view exposes the source as "Currently read from" and puts the plugin config.json label in the Type help text."""
    +    request = RequestFactory().get("/admin/environment/config/PARSE_DOM_OUTLINKS_ENABLED/")
    +    request.user = SimpleNamespace(is_superuser=True)
    +
    +    monkeypatch.setattr(core_views, "get_all_configs", lambda: {})
    +    monkeypatch.setattr(core_views, "get_flat_config", lambda: {})
    +    monkeypatch.setattr(core_views, "get_config", lambda: {"PARSE_DOM_OUTLINKS_ENABLED": True})
    +    monkeypatch.setattr(core_views, "find_config_default", lambda key: "True")
    +    monkeypatch.setattr(core_views, "find_config_type", lambda key: "bool")
    +    monkeypatch.setattr(core_views, "find_config_source", lambda key, merged: "Default")
    +    monkeypatch.setattr(core_views, "key_is_safe", lambda key: True)
    +    monkeypatch.setattr(core_views.CONSTANTS, "CONFIG_FILE", SimpleNamespace(exists=lambda: False))
    +
    +    from archivebox.machine.models import Machine
    +    from archivebox.config.configset import BaseConfigSet
    +
    +    monkeypatch.setattr(Machine, "current", classmethod(lambda cls: SimpleNamespace(id="machine-id", config={})))
    +    monkeypatch.setattr(BaseConfigSet, "load_from_file", classmethod(lambda cls, path: {}))
    +    monkeypatch.setattr(
    +        core_views,
    +        "get_config_definition_link",
    +        lambda key: (
    +            "https://github.com/ArchiveBox/abx-plugins/tree/main/abx_plugins/plugins/parse_dom_outlinks/config.json",
    +            "abx_plugins/plugins/parse_dom_outlinks/config.json",
    +        ),
    +    )
    +    # Call the view via __wrapped__ - presumably the undecorated function (decorator not shown here; confirm).
    +    context = core_views.live_config_value_view.__wrapped__(request, key="PARSE_DOM_OUTLINKS_ENABLED")
    +    section = context["data"][0]
    +    assert "Currently read from" in section["fields"]
    +    assert "Source" not in section["fields"]
    +    assert section["fields"]["Currently read from"] == "Default"
    +    assert "abx_plugins/plugins/parse_dom_outlinks/config.json" in section["help_texts"]["Type"]
    +
    +
    +def test_find_config_source_prefers_environment_over_machine_and_file(monkeypatch):
    +    """With the key set in the env, machine config and config file, "Environment" is reported."""
    +    monkeypatch.setenv("CHECK_SSL_VALIDITY", "false")
    +
    +    from archivebox.machine.models import Machine
    +    from archivebox.config.configset import BaseConfigSet
    +
    +    def fake_current(cls):
    +        return SimpleNamespace(id="machine-id", config={"CHECK_SSL_VALIDITY": "true"})
    +
    +    def fake_load_from_file(cls, path):
    +        return {"CHECK_SSL_VALIDITY": "true"}
    +
    +    monkeypatch.setattr(Machine, "current", classmethod(fake_current))
    +    monkeypatch.setattr(BaseConfigSet, "load_from_file", classmethod(fake_load_from_file))
    +
    +    reported = core_views.find_config_source("CHECK_SSL_VALIDITY", {"CHECK_SSL_VALIDITY": False})
    +    assert reported == "Environment"
    +
    +
    +def test_live_config_value_view_priority_text_matches_runtime_precedence(monkeypatch):
    +    """With the key set everywhere, the view reports "Environment" and lists the sources in priority order."""
    +    request = RequestFactory().get("/admin/environment/config/CHECK_SSL_VALIDITY/")
    +    request.user = SimpleNamespace(is_superuser=True)
    +
    +    monkeypatch.setattr(core_views, "get_all_configs", lambda: {})
    +    monkeypatch.setattr(core_views, "get_flat_config", lambda: {"CHECK_SSL_VALIDITY": True})
    +    monkeypatch.setattr(core_views, "get_config", lambda: {"CHECK_SSL_VALIDITY": False})
    +    monkeypatch.setattr(core_views, "find_config_default", lambda key: "True")
    +    monkeypatch.setattr(core_views, "find_config_type", lambda key: "bool")
    +    monkeypatch.setattr(core_views, "key_is_safe", lambda key: True)
    +
    +    from archivebox.machine.models import Machine
    +    from archivebox.config.configset import BaseConfigSet
    +
    +    monkeypatch.setattr(
    +        Machine,
    +        "current",
    +        classmethod(lambda cls: SimpleNamespace(id="machine-id", config={"CHECK_SSL_VALIDITY": "true"})),
    +    )
    +    monkeypatch.setattr(
    +        BaseConfigSet,
    +        "load_from_file",
    +        classmethod(lambda cls, path: {"CHECK_SSL_VALIDITY": "true"}),
    +    )
    +    monkeypatch.setattr(core_views.CONSTANTS, "CONFIG_FILE", SimpleNamespace(exists=lambda: True))
    +    monkeypatch.setenv("CHECK_SSL_VALIDITY", "false")
    +    # The key is now set in the environment, the machine config and the config file at once.
    +    context = core_views.live_config_value_view.__wrapped__(request, key="CHECK_SSL_VALIDITY")
    +    section = context["data"][0]
    +    assert section["fields"]["Currently read from"] == "Environment"
    +    help_text = section["help_texts"]["Currently read from"]
    +    assert help_text.index("Environment") < help_text.index("Machine") < help_text.index("Config File") < help_text.index("Default")
    +    assert "Configuration Sources (highest priority first):" in section["help_texts"]["Value"]
    diff --git a/archivebox/tests/test_crawl.py b/archivebox/tests/test_crawl.py
    new file mode 100644
    index 0000000000..e1c1a7462d
    --- /dev/null
    +++ b/archivebox/tests/test_crawl.py
    @@ -0,0 +1,191 @@
    +#!/usr/bin/env python3
    +"""Integration tests for archivebox crawl command."""
    +
    +import os
    +import subprocess
    +import sqlite3
    +
    +import pytest
    +
    +
    +def test_crawl_creates_crawl_object(tmp_path, process, disable_extractors_dict):
    +    """Running ``archivebox crawl`` on a URL should leave at least one row in crawls_crawl."""
    +    os.chdir(tmp_path)
    +
    +    # The command's exit status and captured output are not inspected here.
    +    crawl_cmd = ["archivebox", "crawl", "--no-wait", "https://example.com"]
    +    subprocess.run(crawl_cmd, capture_output=True, text=True, env=disable_extractors_dict)
    +
    +    # Look at the SQLite index in the current directory (tmp_path after the chdir above).
    +    latest_crawl_sql = "SELECT id, max_depth FROM crawls_crawl ORDER BY created_at DESC LIMIT 1"
    +    db = sqlite3.connect("index.sqlite3")
    +    cursor = db.cursor()
    +    latest_crawl = cursor.execute(latest_crawl_sql).fetchone()
    +    db.close()
    +
    +    # fetchone() returns None when the query produced no rows.
    +    assert latest_crawl is not None, "Crawl object should be created"
    +
    +
    +def test_crawl_depth_sets_max_depth_in_crawl(tmp_path, process, disable_extractors_dict):
    +    """The value passed via --depth must be stored as max_depth on the newest Crawl row."""
    +    os.chdir(tmp_path)
    +
    +    crawl_cmd = ["archivebox", "crawl", "--depth=2", "--no-wait", "https://example.com"]
    +    subprocess.run(crawl_cmd, capture_output=True, text=True, env=disable_extractors_dict)
    +
    +    # Read back the most recently created crawl straight from the SQLite index.
    +    db = sqlite3.connect("index.sqlite3")
    +    row = (
    +        db.cursor()
    +        .execute("SELECT max_depth FROM crawls_crawl ORDER BY created_at DESC LIMIT 1")
    +        .fetchone()
    +    )
    +    db.close()
    +
    +    assert row is not None
    +    assert row[0] == 2, "Crawl max_depth should match --depth=2"
    +
    +
    +def test_crawl_creates_snapshot_for_url(tmp_path, process, disable_extractors_dict):
    +    """Crawling a URL should insert a core_snapshot row whose url equals that URL."""
    +    os.chdir(tmp_path)
    +
    +    target_url = "https://example.com"
    +    subprocess.run(
    +        ["archivebox", "crawl", "--no-wait", target_url],
    +        capture_output=True,
    +        text=True,
    +        env=disable_extractors_dict,
    +    )
    +
    +    # Parameterized lookup of the exact input URL in the SQLite index.
    +    lookup_sql = "SELECT url FROM core_snapshot WHERE url = ?"
    +    db = sqlite3.connect("index.sqlite3")
    +    cursor = db.cursor()
    +    matching_row = cursor.execute(lookup_sql, (target_url,)).fetchone()
    +    db.close()
    +
    +    assert matching_row is not None, "Snapshot should be created for input URL"
    +
    +
    +def test_crawl_links_snapshot_to_crawl(tmp_path, process, disable_extractors_dict):
    +    """Test that Snapshot is linked to Crawl via crawl_id.
    +
    +    Both rows are fetched before any assertion runs and the connection is closed in
    +    a ``finally`` block, so a failing check can no longer leak the open SQLite handle.
    +    """
    +    os.chdir(tmp_path)
    +
    +    subprocess.run(
    +        ["archivebox", "crawl", "--no-wait", "https://example.com"],
    +        capture_output=True,
    +        text=True,
    +        env=disable_extractors_dict,
    +    )
    +
    +    conn = sqlite3.connect("index.sqlite3")
    +    try:
    +        c = conn.cursor()
    +        # Most recently created crawl, and the crawl_id stored on the input URL's snapshot.
    +        crawl = c.execute("SELECT id FROM crawls_crawl ORDER BY created_at DESC LIMIT 1").fetchone()
    +        snapshot = c.execute("SELECT crawl_id FROM core_snapshot WHERE url = ?", ("https://example.com",)).fetchone()
    +    finally:
    +        conn.close()
    +
    +    assert crawl is not None
    +    assert snapshot is not None
    +    # fetchone() rows are tuples: compare the single selected column from each query.
    +    assert snapshot[0] == crawl[0], "Snapshot should be linked to Crawl"
    +
    +
    +def test_crawl_multiple_urls_creates_multiple_snapshots(tmp_path, process, disable_extractors_dict):
    +    """Every URL given on the command line should end up as a core_snapshot row."""
    +    os.chdir(tmp_path)
    +
    +    # Both URLs are passed as positional arguments to a single crawl invocation.
    +    input_urls = ("https://example.com", "https://iana.org")
    +    crawl_cmd = ["archivebox", "crawl", "--no-wait", *input_urls]
    +    subprocess.run(
    +        crawl_cmd,
    +        capture_output=True,
    +        text=True,
    +        env=disable_extractors_dict,
    +    )
    +
    +    # Collect the url column of all snapshots currently in the SQLite index.
    +    db = sqlite3.connect("index.sqlite3")
    +    cursor = db.cursor()
    +    rows = cursor.execute("SELECT url FROM core_snapshot ORDER BY url").fetchall()
    +    db.close()
    +
    +    stored_urls = {row[0] for row in rows}
    +
    +    # Membership checks only: these two are not required to be the only rows.
    +    for expected_url in input_urls:
    +        assert expected_url in stored_urls
    +
    +
    +def test_crawl_from_file_creates_snapshot(tmp_path, process, disable_extractors_dict):
    +    """Passing the path of a file of URLs should result in at least one snapshot row."""
    +    os.chdir(tmp_path)
    +
    +    # One URL per line; the file path itself is what gets passed to the CLI.
    +    url_list_path = tmp_path / "urls.txt"
    +    url_list_path.write_text("https://example.com\n")
    +
    +    crawl_cmd = ["archivebox", "crawl", "--no-wait", str(url_list_path)]
    +    subprocess.run(crawl_cmd, capture_output=True, text=True, env=disable_extractors_dict)
    +
    +    db = sqlite3.connect("index.sqlite3")
    +    first_row = (
    +        db.cursor()
    +        .execute("SELECT url FROM core_snapshot")
    +        .fetchone()
    +    )
    +    db.close()
    +
    +    # Loose check: any snapshot counts, whether it was recorded for the
    +    # source file or for the URL inside it.
    +    assert first_row is not None, "Should create at least one snapshot"
    +
    +
    +def test_crawl_persists_input_urls_on_crawl(tmp_path, process, disable_extractors_dict):
    +    """The urls column of the newest Crawl row should contain the URL that was crawled."""
    +    os.chdir(tmp_path)
    +
    +    target_url = "https://example.com"
    +    crawl_cmd = ["archivebox", "crawl", "--no-wait", target_url]
    +    subprocess.run(crawl_cmd, capture_output=True, text=True, env=disable_extractors_dict)
    +
    +    latest_urls_sql = "SELECT urls FROM crawls_crawl ORDER BY created_at DESC LIMIT 1"
    +    db = sqlite3.connect("index.sqlite3")
    +    cursor = db.cursor()
    +    row = cursor.execute(latest_urls_sql).fetchone()
    +    db.close()
    +
    +    assert row is not None, "Crawl should be created for crawl input"
    +    # Substring match against the stored urls value, not an exact comparison.
    +    (stored_urls,) = row
    +    assert target_url in stored_urls, "Crawl should persist input URLs"
    +
    +
    +class TestCrawlCLI:
    +    """Command-line surface checks for ``archivebox crawl``."""
    +
    +    def test_cli_help(self, tmp_path, process):
    +        """``archivebox crawl --help`` exits 0 and its output mentions "create"."""
    +        os.chdir(tmp_path)
    +
    +        help_cmd = ["archivebox", "crawl", "--help"]
    +        completed = subprocess.run(help_cmd, capture_output=True, text=True)
    +
    +        # Exit status first, then the help text captured from stdout.
    +        exit_code = completed.returncode
    +        help_text = completed.stdout
    +        assert exit_code == 0
    +        assert "create" in help_text
    +
    +
    +if __name__ == "__main__":
    +    raise SystemExit(pytest.main([__file__, "-v"]))  # propagate pytest's exit code instead of always exiting 0
    diff --git a/archivebox/tests/test_crawl_admin.py b/archivebox/tests/test_crawl_admin.py
    new file mode 100644
    index 0000000000..62d666f443
    --- /dev/null
    +++ b/archivebox/tests/test_crawl_admin.py
    @@ -0,0 +1,232 @@
    +from typing import cast
    +
    +import pytest
    +from django.contrib.auth import get_user_model
    +from django.contrib.auth.models import UserManager
    +from django.urls import reverse
    +
    +from archivebox.crawls.admin import CrawlAdminForm
    +from archivebox.crawls.models import Crawl
    +from archivebox.core.models import Snapshot
    +
    +# Apply pytest-django's ``django_db`` mark to every test in this module.
    +pytestmark = pytest.mark.django_db
    +
    +# User is the active user model; ADMIN_HOST is sent as HTTP_HOST on the admin requests below.
    +User = get_user_model()
    +ADMIN_HOST = "admin.archivebox.localhost:8000"
    +
    +
    +@pytest.fixture
    +def admin_user(db):
    +    """Create the "crawladmin" superuser that the admin tests log in as."""
    +    # cast() only narrows the manager type for static checkers; it is a no-op at runtime.
    +    user_manager = cast(UserManager, User.objects)
    +    credentials = {"username": "crawladmin", "email": "crawladmin@test.com", "password": "testpassword"}
    +    return user_manager.create_superuser(**credentials)
    +
    +
    +@pytest.fixture
    +def crawl(admin_user):
    +    """Persist a Crawl with two newline-separated URLs and tags "alpha,beta"."""
    +    seed_urls = "https://example.com\nhttps://example.org"
    +    crawl_fields = {"urls": seed_urls, "tags_str": "alpha,beta", "created_by": admin_user}
    +    # Ownership is recorded through created_by.
    +    return Crawl.objects.create(**crawl_fields)
    +
    +
    +def test_crawl_admin_change_view_renders_tag_editor_widget(client, admin_user, crawl):
    +    """The Crawl change page renders the tag editor widget along with the crawl's tags."""
    +    client.login(username="crawladmin", password="testpassword")
    +
    +    change_url = reverse("admin:crawls_crawl_change", args=[crawl.pk])
    +    response = client.get(change_url, HTTP_HOST=ADMIN_HOST)
    +
    +    assert response.status_code == 200
    +    page = response.content
    +    # Widget markup first, then the two tag names carried by the crawl under test.
    +    expected_fragments = (b'name="tags_editor"', b"tag-editor-container", b"alpha", b"beta")
    +    for fragment in expected_fragments:
    +        assert fragment in page
    +
    +
    +def test_crawl_admin_add_view_renders_url_filter_alias_fields(client, admin_user):
    +    """The Crawl add page exposes the allowlist and denylist filter inputs."""
    +    client.login(username="crawladmin", password="testpassword")
    +
    +    add_url = reverse("admin:crawls_crawl_add")
    +    response = client.get(add_url, HTTP_HOST=ADMIN_HOST)
    +    html = response.content
    +
    +    assert response.status_code == 200
    +    # Both filter inputs plus the "Same domain only" text must be present.
    +    for marker in (b'name="url_filters_allowlist"', b'name="url_filters_denylist"', b"Same domain only"):
    +        assert marker in html
    +
    +
    +def test_crawl_admin_form_saves_tags_editor_to_tags_str(crawl, admin_user):
    +    """Saving the admin form stores de-duplicated tags plus the limit and URL-filter values."""
    +    timestamp_format = "%Y-%m-%d %H:%M:%S"
    +    size_limit = 45 * 1024 * 1024
    +    allowlist = "example.com\n*.example.com"
    +    denylist = "static.example.com"
    +    submitted = {
    +        "created_at": crawl.created_at.strftime(timestamp_format),
    +        "urls": crawl.urls,
    +        "config": "{}",
    +        "max_depth": "0",
    +        "max_urls": "3",
    +        "max_size": str(size_limit),
    +        "tags_editor": "alpha, beta, Alpha, gamma",
    +        "url_filters_allowlist": allowlist,
    +        "url_filters_denylist": denylist,
    +        "persona_id": "",
    +        "label": "",
    +        "notes": "",
    +        "schedule": "",
    +        "status": crawl.status,
    +        "retry_at": crawl.retry_at.strftime(timestamp_format),
    +        "created_by": str(admin_user.pk),
    +        "num_uses_failed": "0",
    +        "num_uses_succeeded": "0",
    +    }
    +    form = CrawlAdminForm(data=submitted, instance=crawl)
    +    assert form.is_valid(), form.errors
    +
    +    saved = form.save()
    +    saved.refresh_from_db()
    +    assert saved.tags_str == "alpha,beta,gamma"
    +    assert (saved.max_urls, saved.max_size) == (3, size_limit)
    +    assert (saved.config["MAX_URLS"], saved.config["MAX_SIZE"]) == (3, size_limit)
    +    assert saved.config["URL_ALLOWLIST"] == allowlist
    +    assert saved.config["URL_DENYLIST"] == denylist
    +
    +
    +def test_crawl_admin_delete_snapshot_action_removes_snapshot_and_url(client, admin_user):
    +    """Posting to the snapshot-delete admin URL removes the Snapshot row and its URL from the crawl."""
    +    doomed_url = "https://example.com/remove-me"
    +    crawl = Crawl.objects.create(urls=doomed_url, created_by=admin_user)
    +    snapshot = Snapshot.objects.create(crawl=crawl, url=doomed_url)
    +
    +    client.login(username="crawladmin", password="testpassword")
    +    delete_url = reverse("admin:crawls_crawl_snapshot_delete", args=[crawl.pk, snapshot.pk])
    +    response = client.post(delete_url, HTTP_HOST=ADMIN_HOST)
    +
    +    # The response must be JSON whose "ok" flag is exactly True.
    +    assert response.status_code == 200
    +    payload = response.json()
    +    assert payload["ok"] is True
    +
    +    # The row must be gone from the database...
    +    remaining = Snapshot.objects.filter(pk=snapshot.pk)
    +    assert not remaining.exists()
    +
    +    # ...and the reloaded crawl must no longer list the URL.
    +    crawl.refresh_from_db()
    +    assert doomed_url not in crawl.urls
    +
    +
    +def test_crawl_admin_exclude_domain_action_prunes_urls_and_pending_snapshots(client, admin_user):
    +    """Excluding a snapshot's domain denylists it, prunes its URLs and removes the queued snapshot.
    +
    +    The sealed snapshot on a different domain, and its URL, must be left untouched.
    +    """
    +    cdn_asset = "https://cdn.example.com/asset.js"
    +    cdn_second = "https://cdn.example.com/second.js"
    +    root_page = "https://example.com/root"
    +    crawl = Crawl.objects.create(
    +        urls="\n".join([cdn_asset, cdn_second, root_page]),
    +        created_by=admin_user,
    +    )
    +    queued_snapshot = Snapshot.objects.create(
    +        crawl=crawl,
    +        url=cdn_asset,
    +        status=Snapshot.StatusChoices.QUEUED,
    +    )
    +    preserved_snapshot = Snapshot.objects.create(
    +        crawl=crawl,
    +        url=root_page,
    +        status=Snapshot.StatusChoices.SEALED,
    +    )
    +
    +    client.login(username="crawladmin", password="testpassword")
    +    exclude_url = reverse("admin:crawls_crawl_snapshot_exclude_domain", args=[crawl.pk, queued_snapshot.pk])
    +    response = client.post(exclude_url, HTTP_HOST=ADMIN_HOST)
    +
    +    assert response.status_code == 200
    +    payload = response.json()
    +    assert payload["ok"] is True
    +    assert payload["domain"] == "cdn.example.com"
    +
    +    crawl.refresh_from_db()
    +    assert crawl.get_url_denylist(use_effective_config=False) == ["cdn.example.com"]
    +    for pruned_url in (cdn_asset, cdn_second):
    +        assert pruned_url not in crawl.urls
    +    assert root_page in crawl.urls
    +
    +    assert not Snapshot.objects.filter(pk=queued_snapshot.pk).exists()
    +    assert Snapshot.objects.filter(pk=preserved_snapshot.pk).exists()
    +
    +
    +def test_snapshot_from_json_trims_markdown_suffixes_on_discovered_urls(crawl):
    +    """Snapshot.from_json drops the trailing ")**" markdown residue from a record's URL."""
    +    clean_url = "https://docs.sweeting.me/s/youtube-favorites"
    +    record = {"url": "https://docs.sweeting.me/s/youtube-favorites)**"}
    +
    +    snapshot = Snapshot.from_json(record, overrides={"crawl": crawl}, queue_for_extraction=False)
    +
    +    assert snapshot is not None
    +    assert snapshot.url == clean_url
    +
    +
    +def test_create_snapshots_from_urls_respects_url_allowlist_and_denylist(admin_user):
    +    """Only URLs that pass the allowlist and are not hit by the denylist become snapshots."""
    +    candidate_urls = [
    +        "https://example.com/root",
    +        "https://static.example.com/app.js",
    +        "https://other.test/page",
    +    ]
    +    url_filters = {
    +        "URL_ALLOWLIST": "example.com",
    +        "URL_DENYLIST": "static.example.com",
    +    }
    +    crawl = Crawl.objects.create(
    +        urls="\n".join(candidate_urls),
    +        created_by=admin_user,
    +        config=url_filters,
    +    )
    +
    +    created_urls = [snapshot.url for snapshot in crawl.create_snapshots_from_urls()]
    +    assert created_urls == ["https://example.com/root"]
    +
    +
    +def test_url_filter_regex_lists_preserve_commas_and_split_on_newlines_only(admin_user):
    +    """Regex filter entries are split on newlines only, so patterns may contain commas."""
    +    allow_patterns = [
    +        r"^https://example\.com/(root|path,with,commas)$",
    +        r"^https://other\.test/page$",
    +    ]
    +    deny_patterns = [r"^https://example\.com/path,with,commas$"]
    +    crawl = Crawl.objects.create(
    +        urls="\n".join(
    +            [
    +                "https://example.com/root",
    +                "https://example.com/path,with,commas",
    +                "https://other.test/page",
    +            ],
    +        ),
    +        created_by=admin_user,
    +        config={
    +            "URL_ALLOWLIST": "\n".join(allow_patterns),
    +            "URL_DENYLIST": "\n".join(deny_patterns),
    +        },
    +    )
    +
    +    # Each configured pattern must come back as one intact list entry.
    +    assert crawl.get_url_allowlist(use_effective_config=False) == allow_patterns
    +    assert crawl.get_url_denylist(use_effective_config=False) == deny_patterns
    +
    +    created_urls = [snapshot.url for snapshot in crawl.create_snapshots_from_urls()]
    +    # The comma-containing URL matches both an allow and a deny pattern, and is expected to be dropped.
    +    expected_urls = ["https://example.com/root", "https://other.test/page"]
    +    assert created_urls == expected_urls
    diff --git a/archivebox/tests/test_hooks.py b/archivebox/tests/test_hooks.py
    new file mode 100755
    index 0000000000..e00126e6d9
    --- /dev/null
    +++ b/archivebox/tests/test_hooks.py
    @@ -0,0 +1,665 @@
    +#!/usr/bin/env python3
    +"""
    +Unit tests for the ArchiveBox hook architecture.
    +
    +Tests hook discovery, execution, JSONL parsing, background hook detection,
    +binary lookup, and required_binaries XYZ_BINARY passthrough handling.
    +
    +Run with:
    +    sudo -u testuser bash -c 'source .venv/bin/activate && python -m pytest archivebox/tests/test_hooks.py -v'
    +"""
    +
    +import json
    +import os
    +import shutil
    +import subprocess
    +import tempfile
    +import unittest
    +from pathlib import Path
    +from unittest.mock import patch
    +
    +# Set up Django before importing any Django-dependent modules
    +os.environ.setdefault("DJANGO_SETTINGS_MODULE", "archivebox.settings")
    +
    +
    +class TestBackgroundHookDetection(unittest.TestCase):
    +    """Filename-based detection of background hooks (``.bg.`` suffix or legacy ``__background``)."""
    +
    +    @staticmethod
    +    def _is_background(hook_filename):
    +        """Import is_background_hook at call time and apply it to *hook_filename*."""
    +        from archivebox.hooks import is_background_hook
    +        return is_background_hook(hook_filename)
    +
    +    def test_bg_js_suffix_detected(self):
    +        """A ``.bg.js`` hook filename is classified as background."""
    +        hook_filename = "on_Snapshot__21_consolelog.daemon.bg.js"
    +        self.assertTrue(self._is_background(hook_filename))
    +
    +    def test_bg_py_suffix_detected(self):
    +        """A ``.bg.py`` hook filename is classified as background."""
    +        hook_filename = "on_Snapshot__24_responses.finite.bg.py"
    +        self.assertTrue(self._is_background(hook_filename))
    +
    +    def test_bg_sh_suffix_detected(self):
    +        """A ``.bg.sh`` hook filename is classified as background."""
    +        hook_filename = "on_Snapshot__23_ssl.daemon.bg.sh"
    +        self.assertTrue(self._is_background(hook_filename))
    +
    +    def test_legacy_background_suffix_detected(self):
    +        """A ``__background`` stem is still classified as background (backwards compat)."""
    +        hook_filename = "on_Snapshot__21_consolelog__background.js"
    +        self.assertTrue(self._is_background(hook_filename))
    +
    +    def test_foreground_hook_not_detected(self):
    +        """A name with neither ``.bg.`` nor ``__background`` is NOT background."""
    +        hook_filename = "on_Snapshot__11_favicon.js"
    +        self.assertFalse(self._is_background(hook_filename))
    +
    +    def test_foreground_py_hook_not_detected(self):
    +        """A Python hook name without ``.bg.`` is NOT background."""
    +        hook_filename = "on_Snapshot__50_wget.py"
    +        self.assertFalse(self._is_background(hook_filename))
    +
    +
    +class TestJSONLParsing(unittest.TestCase):
    +    """Test Process.parse_records_from_text() on hook-style stdout (run_hook() itself is not called)."""
    +
    +    def test_parse_clean_jsonl(self):
    +        """A single clean JSONL line should yield one record with its fields intact."""
    +        stdout = '{"type": "ArchiveResult", "status": "succeeded", "output_str": "Done"}'
    +        from archivebox.machine.models import Process
    +
    +        records = Process.parse_records_from_text(stdout)
    +
    +        self.assertEqual(len(records), 1)
    +        self.assertEqual(records[0]["type"], "ArchiveResult")
    +        self.assertEqual(records[0]["status"], "succeeded")
    +        self.assertEqual(records[0]["output_str"], "Done")
    +
    +    def test_parse_multiple_jsonl_records(self):
    +        """Two JSONL lines should yield two records, returned in input order."""
    +        stdout = """{"type": "ArchiveResult", "status": "succeeded", "output_str": "Done"}
    +{"type": "Binary", "name": "wget", "abspath": "/usr/bin/wget"}"""
    +        from archivebox.machine.models import Process
    +
    +        records = Process.parse_records_from_text(stdout)
    +
    +        self.assertEqual(len(records), 2)
    +        self.assertEqual(records[0]["type"], "ArchiveResult")
    +        self.assertEqual(records[1]["type"], "Binary")
    +
    +    def test_parse_jsonl_with_log_output(self):
    +        """The one JSONL line should be extracted from stdout that also contains plain log lines."""
    +        stdout = """Starting hook execution...
    +Processing URL: https://example.com
    +{"type": "ArchiveResult", "status": "succeeded", "output_str": "Downloaded"}
    +Hook completed successfully"""
    +        from archivebox.machine.models import Process
    +
    +        records = Process.parse_records_from_text(stdout)
    +
    +        self.assertEqual(len(records), 1)
    +        self.assertEqual(records[0]["status"], "succeeded")
    +
    +    def test_ignore_invalid_json(self):
    +        """Malformed JSON and plain-text lines should be dropped, leaving only the two valid records."""
    +        stdout = """{"type": "ArchiveResult", "status": "succeeded"}
    +{invalid json here}
    +not json at all
    +{"type": "BinaryRequest", "name": "wget"}"""
    +        from archivebox.machine.models import Process
    +
    +        records = Process.parse_records_from_text(stdout)
    +
    +        self.assertEqual(len(records), 2)
    +
    +    def test_json_without_type_ignored(self):
    +        """A valid JSON object lacking a 'type' key should be dropped; the typed record is kept."""
    +        stdout = """{"status": "succeeded", "output_str": "Done"}
    +{"type": "ArchiveResult", "status": "succeeded"}"""
    +        from archivebox.machine.models import Process
    +
    +        records = Process.parse_records_from_text(stdout)
    +
    +        self.assertEqual(len(records), 1)
    +        self.assertEqual(records[0]["type"], "ArchiveResult")
    +
    +
    +class TestRequiredBinaryConfigHandling(unittest.TestCase):
    +    """Spell out the expected XYZ_BINARY passthrough rules (asserts on local values only; no archivebox code is called)."""
    +
    +    def setUp(self):
    +        """Create a scratch directory and a hook path inside it (the path is never written to in this class)."""
    +        self.work_dir = Path(tempfile.mkdtemp())
    +        self.test_hook = self.work_dir / "test_hook.py"
    +
    +    def tearDown(self):
    +        """Remove the scratch directory, ignoring any errors during removal."""
    +        shutil.rmtree(self.work_dir, ignore_errors=True)
    +
    +    def test_binary_env_var_absolute_path_handling(self):
    +        """Absolute binary paths should pass through unchanged."""
    +        configured_binary = "/custom/path/to/wget2"
    +        binary_name = configured_binary  # NOTE(review): plain assignment; no project lookup is exercised
    +
    +        self.assertEqual(binary_name, "/custom/path/to/wget2")
    +
    +    def test_binary_env_var_name_only_handling(self):
    +        """Binary command names should pass through unchanged."""
    +        configured_binary = "wget2"
    +        binary_name = configured_binary  # NOTE(review): plain assignment; no project lookup is exercised
    +
    +        self.assertEqual(binary_name, "wget2")
    +
    +    def test_binary_env_var_empty_default(self):
    +        """An empty configured value should fall back to a default (the fallback here is the literal "wget")."""
    +        configured_binary = ""
    +        if configured_binary:
    +            binary_name = configured_binary
    +        else:
    +            binary_name = "wget"
    +
    +        self.assertEqual(binary_name, "wget")
    +
    +
    +class TestHookDiscovery(unittest.TestCase):
    +    """Test hook discovery and event-name normalization against a throwaway plugins directory."""
    +
    +    def setUp(self):
    +        """Build a temp plugins tree: wget (Snapshot + BinaryRequest hooks), chrome and consolelog (.bg.js hooks)."""
    +        self.test_dir = Path(tempfile.mkdtemp())
    +        self.plugins_dir = self.test_dir / "plugins"
    +        self.plugins_dir.mkdir()
    +
    +        # Create test plugin structure
    +        wget_dir = self.plugins_dir / "wget"
    +        wget_dir.mkdir()
    +        (wget_dir / "on_Snapshot__50_wget.py").write_text("# test hook")
    +        (wget_dir / "on_BinaryRequest__10_wget.py").write_text("# binary request hook")
    +
    +        chrome_dir = self.plugins_dir / "chrome"
    +        chrome_dir.mkdir(exist_ok=True)
    +        (chrome_dir / "on_Snapshot__20_chrome_tab.daemon.bg.js").write_text("// background hook")
    +
    +        consolelog_dir = self.plugins_dir / "consolelog"
    +        consolelog_dir.mkdir()
    +        (consolelog_dir / "on_Snapshot__21_consolelog.daemon.bg.js").write_text("// background hook")
    +
    +    def tearDown(self):
    +        """Remove the temp plugins tree, ignoring any errors during removal."""
    +        shutil.rmtree(self.test_dir, ignore_errors=True)
    +
    +    def test_discover_hooks_by_event(self):
    +        """Globbing on_Snapshot__* across plugin dirs should find all three Snapshot hooks."""
    +        # NOTE(review): local glob stand-in for discover_hooks(); sibling tests patch BUILTIN_PLUGINS_DIR and call it directly
    +        hooks = []
    +        for ext in ("sh", "py", "js"):
    +            pattern = f"*/on_Snapshot__*.{ext}"
    +            hooks.extend(self.plugins_dir.glob(pattern))
    +
    +        hooks = sorted(set(hooks), key=lambda p: p.name)
    +
    +        self.assertEqual(len(hooks), 3)
    +        hook_names = [h.name for h in hooks]
    +        self.assertIn("on_Snapshot__20_chrome_tab.daemon.bg.js", hook_names)
    +        self.assertIn("on_Snapshot__21_consolelog.daemon.bg.js", hook_names)
    +        self.assertIn("on_Snapshot__50_wget.py", hook_names)
    +
    +    def test_discover_hooks_sorted_by_name(self):
    +        """Sorting the globbed hooks by filename should order them by their numeric prefix (20, 21, 50)."""
    +        hooks = []
    +        for ext in ("sh", "py", "js"):
    +            pattern = f"*/on_Snapshot__*.{ext}"
    +            hooks.extend(self.plugins_dir.glob(pattern))
    +
    +        hooks = sorted(set(hooks), key=lambda p: p.name)
    +
    +        # Check numeric ordering
    +        self.assertEqual(hooks[0].name, "on_Snapshot__20_chrome_tab.daemon.bg.js")
    +        self.assertEqual(hooks[1].name, "on_Snapshot__21_consolelog.daemon.bg.js")
    +        self.assertEqual(hooks[2].name, "on_Snapshot__50_wget.py")
    +
    +    def test_normalize_hook_event_name_accepts_event_classes(self):
    +        """normalize_hook_event_name() should map "<Name>Event" class names to the bare hook family name."""
    +        from archivebox import hooks as hooks_module
    +
    +        self.assertEqual(hooks_module.normalize_hook_event_name("InstallEvent"), "Install")
    +        self.assertEqual(hooks_module.normalize_hook_event_name("BinaryRequestEvent"), "BinaryRequest")
    +        self.assertEqual(hooks_module.normalize_hook_event_name("CrawlSetupEvent"), "CrawlSetup")
    +        self.assertEqual(hooks_module.normalize_hook_event_name("SnapshotEvent"), "Snapshot")
    +
    +    def test_normalize_hook_event_name_strips_event_suffix_for_lifecycle_events(self):
    +        """Lifecycle event names should likewise come back with the trailing "Event" removed."""
    +        from archivebox import hooks as hooks_module
    +
    +        self.assertEqual(hooks_module.normalize_hook_event_name("BinaryEvent"), "Binary")
    +        self.assertEqual(hooks_module.normalize_hook_event_name("CrawlEvent"), "Crawl")
    +        self.assertEqual(hooks_module.normalize_hook_event_name("SnapshotCleanupEvent"), "SnapshotCleanup")
    +        self.assertEqual(hooks_module.normalize_hook_event_name("CrawlCleanupEvent"), "CrawlCleanup")
    +
    +    def test_discover_hooks_skips_plugins_with_disabled_required_dependencies(self):
    +        """With CHROME_ENABLED=False, neither chrome nor accessibility (which requires chrome) should be discovered."""
    +        from archivebox import hooks as hooks_module
    +
    +        chrome_dir = self.plugins_dir / "chrome"
    +        chrome_dir.mkdir(exist_ok=True)
    +        (chrome_dir / "config.json").write_text(
    +            json.dumps(
    +                {
    +                    "type": "object",
    +                    "required_plugins": [],
    +                    "properties": {
    +                        "CHROME_ENABLED": {
    +                            "type": "boolean",
    +                            "default": True,
    +                            "x-aliases": ["USE_CHROME"],
    +                        },
    +                    },
    +                },
    +            ),
    +        )
    +        (chrome_dir / "on_Snapshot__20_chrome.js").write_text("// chrome hook")
    +
    +        accessibility_dir = self.plugins_dir / "accessibility"
    +        accessibility_dir.mkdir(exist_ok=True)
    +        (accessibility_dir / "config.json").write_text(
    +            json.dumps(
    +                {
    +                    "type": "object",
    +                    "required_plugins": ["chrome"],
    +                    "properties": {
    +                        "ACCESSIBILITY_ENABLED": {
    +                            "type": "boolean",
    +                            "default": True,
    +                        },
    +                    },
    +                },
    +            ),
    +        )
    +        (accessibility_dir / "on_Snapshot__10_accessibility.js").write_text("// accessibility hook")
    +
    +        wget_dir = self.plugins_dir / "wget"
    +        (wget_dir / "config.json").write_text(
    +            json.dumps(
    +                {
    +                    "type": "object",
    +                    "required_plugins": [],
    +                    "properties": {
    +                        "WGET_ENABLED": {
    +                            "type": "boolean",
    +                            "default": True,
    +                            "x-aliases": ["SAVE_WGET"],
    +                        },
    +                    },
    +                },
    +            ),
    +        )
    +
    +        with (  # NOTE(review): sibling tests call get_plugins.cache_clear() before patching; this one does not - confirm
    +            patch.object(hooks_module, "BUILTIN_PLUGINS_DIR", self.plugins_dir),
    +            patch.object(hooks_module, "USER_PLUGINS_DIR", self.test_dir / "user_plugins"),
    +        ):
    +            hooks = hooks_module.discover_hooks("Snapshot", config={"CHROME_ENABLED": False, "WGET_ENABLED": True})
    +
    +        hook_names = [hook.parent.name for hook in hooks]  # plugin directory names, not hook filenames
    +        self.assertIn("wget", hook_names)
    +        self.assertNotIn("chrome", hook_names)
    +        self.assertNotIn("accessibility", hook_names)
    +
    +    def test_get_plugins_includes_non_snapshot_plugin_dirs(self):
    +        """get_plugins() should list a plugin dir that only contains a BinaryRequest hook and a config.json."""
    +        env_dir = self.plugins_dir / "env"
    +        env_dir.mkdir()
    +        (env_dir / "on_BinaryRequest__15_env.py").write_text("# binary hook")
    +        (env_dir / "config.json").write_text('{"type": "object", "properties": {}}')
    +
    +        from archivebox import hooks as hooks_module
    +
    +        hooks_module.get_plugins.cache_clear()
    +        with (
    +            patch.object(hooks_module, "BUILTIN_PLUGINS_DIR", self.plugins_dir),
    +            patch.object(hooks_module, "USER_PLUGINS_DIR", self.test_dir / "user_plugins"),
    +        ):
    +            plugins = hooks_module.get_plugins()
    +
    +        self.assertIn("env", plugins)
    +
    +    def test_discover_binary_hooks_ignores_plugins_whitelist(self):
    +        """With PLUGINS="singlefile", the npm plugin's BinaryRequest hook should still be discovered."""
    +        singlefile_dir = self.plugins_dir / "singlefile"
    +        singlefile_dir.mkdir()
    +        (singlefile_dir / "config.json").write_text(
    +            json.dumps(
    +                {
    +                    "type": "object",
    +                    "required_plugins": ["chrome"],
    +                    "properties": {},
    +                },
    +            ),
    +        )
    +
    +        npm_dir = self.plugins_dir / "npm"
    +        npm_dir.mkdir()
    +        (npm_dir / "on_BinaryRequest__10_npm.py").write_text("# npm binary hook")
    +        (npm_dir / "config.json").write_text('{"type": "object", "properties": {}}')
    +
    +        from archivebox import hooks as hooks_module
    +
    +        hooks_module.get_plugins.cache_clear()
    +        with (
    +            patch.object(hooks_module, "BUILTIN_PLUGINS_DIR", self.plugins_dir),
    +            patch.object(hooks_module, "USER_PLUGINS_DIR", self.test_dir / "user_plugins"),
    +        ):
    +            hooks = hooks_module.discover_hooks("BinaryRequest", config={"PLUGINS": "singlefile"})
    +
    +        hook_names = [hook.name for hook in hooks]
    +        self.assertIn("on_BinaryRequest__10_npm.py", hook_names)
    +
    +    def test_discover_hooks_accepts_event_class_names(self):
    +        """discover_hooks() should accept "BinaryRequestEvent" / "SnapshotEvent" and find the matching wget hooks."""
    +        from archivebox import hooks as hooks_module
    +
    +        hooks_module.get_plugins.cache_clear()
    +        with (
    +            patch.object(hooks_module, "BUILTIN_PLUGINS_DIR", self.plugins_dir),
    +            patch.object(hooks_module, "USER_PLUGINS_DIR", self.test_dir / "user_plugins"),
    +        ):
    +            binary_hooks = hooks_module.discover_hooks("BinaryRequestEvent", filter_disabled=False)
    +            snapshot_hooks = hooks_module.discover_hooks("SnapshotEvent", filter_disabled=False)
    +
    +        self.assertIn("on_BinaryRequest__10_wget.py", [hook.name for hook in binary_hooks])
    +        self.assertIn("on_Snapshot__50_wget.py", [hook.name for hook in snapshot_hooks])
    +
    +    def test_discover_hooks_returns_empty_for_non_hook_lifecycle_events(self):
    +        """discover_hooks() should return [] for "BinaryEvent" and "CrawlCleanupEvent" in this plugins tree."""
    +        from archivebox import hooks as hooks_module
    +
    +        hooks_module.get_plugins.cache_clear()
    +        with (
    +            patch.object(hooks_module, "BUILTIN_PLUGINS_DIR", self.plugins_dir),
    +            patch.object(hooks_module, "USER_PLUGINS_DIR", self.test_dir / "user_plugins"),
    +        ):
    +            self.assertEqual(hooks_module.discover_hooks("BinaryEvent", filter_disabled=False), [])
    +            self.assertEqual(hooks_module.discover_hooks("CrawlCleanupEvent", filter_disabled=False), [])
    +
    +
    +class TestGetExtractorName(unittest.TestCase):
    +    """Test numeric-prefix stripping using an inline copy of get_extractor_name() (no project function is imported)."""
    +
    +    def test_strip_numeric_prefix(self):
    +        """A leading "<digits>_" prefix should be removed; later underscores are left alone."""
    +
    +        # Inline implementation of get_extractor_name
    +        def get_extractor_name(extractor: str) -> str:
    +            parts = extractor.split("_", 1)  # split on the first underscore only
    +            if len(parts) == 2 and parts[0].isdigit():
    +                return parts[1]
    +            return extractor
    +
    +        self.assertEqual(get_extractor_name("10_title"), "title")
    +        self.assertEqual(get_extractor_name("26_readability"), "readability")
    +        self.assertEqual(get_extractor_name("50_parse_html_urls"), "parse_html_urls")
    +
    +    def test_no_prefix_unchanged(self):
    +        """A name with no underscore-delimited numeric prefix should be returned as-is."""
    +
    +        def get_extractor_name(extractor: str) -> str:
    +            parts = extractor.split("_", 1)  # same inline copy as in test_strip_numeric_prefix
    +            if len(parts) == 2 and parts[0].isdigit():
    +                return parts[1]
    +            return extractor
    +
    +        self.assertEqual(get_extractor_name("title"), "title")
    +        self.assertEqual(get_extractor_name("readability"), "readability")
    +
    +
    +class TestHookExecution(unittest.TestCase):
    +    """Run small hook scripts as real subprocesses and parse their stdout with Process.parse_records_from_text()."""
    +
    +    def setUp(self):
    +        """Create a scratch directory that holds the hook script and serves as its cwd."""
    +        self.work_dir = Path(tempfile.mkdtemp())
    +
    +    def tearDown(self):
    +        """Remove the scratch directory, ignoring any errors during removal."""
    +        shutil.rmtree(self.work_dir, ignore_errors=True)
    +
    +    def test_python_hook_execution(self):
    +        """A Python hook run via python3 should exit 0 and print a parseable ArchiveResult record."""
    +        hook_path = self.work_dir / "test_hook.py"
    +        hook_path.write_text("""#!/usr/bin/env python3
    +import json
    +print(json.dumps({"type": "ArchiveResult", "status": "succeeded", "output_str": "Test passed"}))
    +""")
    +
    +        result = subprocess.run(
    +            ["python3", str(hook_path)],
    +            cwd=str(self.work_dir),
    +            capture_output=True,
    +            text=True,
    +        )
    +
    +        self.assertEqual(result.returncode, 0)
    +        from archivebox.machine.models import Process
    +
    +        records = Process.parse_records_from_text(result.stdout)
    +        self.assertTrue(records)
    +        self.assertEqual(records[0]["type"], "ArchiveResult")
    +        self.assertEqual(records[0]["status"], "succeeded")
    +
    +    def test_js_hook_execution(self):
    +        """A JS hook run via node should exit 0 and print a parseable record (fails, not skips, if node is missing)."""
    +        self.assertIsNotNone(shutil.which("node"), "Node.js not available")
    +
    +        hook_path = self.work_dir / "test_hook.js"
    +        hook_path.write_text("""#!/usr/bin/env node
    +console.log(JSON.stringify({type: 'ArchiveResult', status: 'succeeded', output_str: 'JS test'}));
    +""")
    +
    +        result = subprocess.run(
    +            ["node", str(hook_path)],
    +            cwd=str(self.work_dir),
    +            capture_output=True,
    +            text=True,
    +        )
    +
    +        self.assertEqual(result.returncode, 0)
    +        from archivebox.machine.models import Process
    +
    +        records = Process.parse_records_from_text(result.stdout)
    +        self.assertTrue(records)
    +        self.assertEqual(records[0]["type"], "ArchiveResult")
    +        self.assertEqual(records[0]["status"], "succeeded")
    +
    +    def test_hook_receives_cli_args(self):
    +        """A --url=... argument passed on the command line should be echoed back in the hook's output record."""
    +        hook_path = self.work_dir / "test_hook.py"
    +        hook_path.write_text("""#!/usr/bin/env python3
    +import sys
    +import json
    +# Simple arg parsing
    +args = {}
    +for arg in sys.argv[1:]:
    +    if arg.startswith('--') and '=' in arg:
    +        key, val = arg[2:].split('=', 1)
    +        args[key.replace('-', '_')] = val
    +print(json.dumps({"type": "ArchiveResult", "status": "succeeded", "url": args.get("url", "")}))
    +""")
    +
    +        result = subprocess.run(
    +            ["python3", str(hook_path), "--url=https://example.com"],
    +            cwd=str(self.work_dir),
    +            capture_output=True,
    +            text=True,
    +        )
    +
    +        self.assertEqual(result.returncode, 0)
    +        from archivebox.machine.models import Process
    +
    +        records = Process.parse_records_from_text(result.stdout)
    +        self.assertTrue(records)
    +        self.assertEqual(records[0]["url"], "https://example.com")
    +
    +
    +class TestDependencyRecordOutput(unittest.TestCase):
    +    """Check that hand-built Binary records survive Process.parse_records_from_text() (no hook is executed)."""
    +
    +    def setUp(self):
    +        """Create a scratch directory (no test in this class reads or writes it)."""
    +        self.work_dir = Path(tempfile.mkdtemp())
    +
    +    def tearDown(self):
    +        """Remove the scratch directory, ignoring any errors during removal."""
    +        shutil.rmtree(self.work_dir, ignore_errors=True)
    +
    +    def test_dependency_record_outputs_binary(self):
    +        """A Binary record carrying sha256/binprovider fields should parse and keep an absolute abspath."""
    +        hook_output = json.dumps(
    +            {
    +                "type": "Binary",
    +                "name": "wget",
    +                "abspath": "/usr/bin/wget",
    +                "version": "1.21.3",
    +                "sha256": None,
    +                "binprovider": "apt",
    +            },
    +        )
    +
    +        from archivebox.machine.models import Process
    +
    +        data = Process.parse_records_from_text(hook_output)[0]
    +        self.assertEqual(data["type"], "Binary")
    +        self.assertEqual(data["name"], "wget")
    +        self.assertTrue(data["abspath"].startswith("/"))
    +
    +    def test_dependency_record_outputs_binary_jsonl(self):
    +        """A Binary record with binprovider "env" should round-trip its type, name and exact abspath."""
    +        hook_output = json.dumps(
    +            {
    +                "type": "Binary",
    +                "name": "wget",
    +                "abspath": "/usr/bin/wget",
    +                "version": "1.21.3",
    +                "binprovider": "env",
    +            },
    +        )
    +
    +        from archivebox.machine.models import Process
    +
    +        data = Process.parse_records_from_text(hook_output)[0]
    +        self.assertEqual(data["type"], "Binary")
    +        self.assertEqual(data["name"], "wget")
    +        self.assertEqual(data["abspath"], "/usr/bin/wget")
    +
    +
    +class TestSnapshotHookOutput(unittest.TestCase):
    +    """Check that hand-built ArchiveResult records survive Process.parse_records_from_text() unchanged."""
    +
    +    def test_snapshot_hook_basic_output(self):
    +        """A minimal ArchiveResult record should parse with type, status and output_str present."""
    +        hook_output = json.dumps(
    +            {
    +                "type": "ArchiveResult",
    +                "status": "succeeded",
    +                "output_str": "Downloaded 5 files",
    +            },
    +        )
    +
    +        from archivebox.machine.models import Process
    +
    +        data = Process.parse_records_from_text(hook_output)[0]
    +        self.assertEqual(data["type"], "ArchiveResult")
    +        self.assertEqual(data["status"], "succeeded")
    +        self.assertIn("output_str", data)
    +
    +    def test_snapshot_hook_with_cmd(self):
    +        """A cmd list should be preserved as a list, with the binary path as its first element."""
    +        hook_output = json.dumps(
    +            {
    +                "type": "ArchiveResult",
    +                "status": "succeeded",
    +                "output_str": "Archived with wget",
    +                "cmd": ["/usr/bin/wget", "-p", "-k", "https://example.com"],
    +            },
    +        )
    +
    +        from archivebox.machine.models import Process
    +
    +        data = Process.parse_records_from_text(hook_output)[0]
    +        self.assertEqual(data["type"], "ArchiveResult")
    +        self.assertIsInstance(data["cmd"], list)
    +        self.assertEqual(data["cmd"][0], "/usr/bin/wget")
    +
    +    def test_snapshot_hook_with_output_json(self):
    +        """A nested output_json object should be preserved as a dict with its values intact."""
    +        hook_output = json.dumps(
    +            {
    +                "type": "ArchiveResult",
    +                "status": "succeeded",
    +                "output_str": "Got headers",
    +                "output_json": {
    +                    "content-type": "text/html",
    +                    "server": "nginx",
    +                    "status-code": 200,
    +                },
    +            },
    +        )
    +
    +        from archivebox.machine.models import Process
    +
    +        data = Process.parse_records_from_text(hook_output)[0]
    +        self.assertEqual(data["type"], "ArchiveResult")
    +        self.assertIsInstance(data["output_json"], dict)
    +        self.assertEqual(data["output_json"]["status-code"], 200)
    +
    +    def test_snapshot_hook_skipped_status(self):
    +        """A record with status "skipped" should parse with that status preserved."""
    +        hook_output = json.dumps(
    +            {
    +                "type": "ArchiveResult",
    +                "status": "skipped",
    +                "output_str": "SAVE_WGET=False",
    +            },
    +        )
    +
    +        from archivebox.machine.models import Process
    +
    +        data = Process.parse_records_from_text(hook_output)[0]
    +        self.assertEqual(data["status"], "skipped")
    +
    +    def test_snapshot_hook_failed_status(self):
    +        """A record with status "failed" should parse with that status preserved."""
    +        hook_output = json.dumps(
    +            {
    +                "type": "ArchiveResult",
    +                "status": "failed",
    +                "output_str": "404 Not Found",
    +            },
    +        )
    +
    +        from archivebox.machine.models import Process
    +
    +        data = Process.parse_records_from_text(hook_output)[0]
    +        self.assertEqual(data["status"], "failed")
    +
    +
    +class TestPluginMetadata(unittest.TestCase):
    +    """Illustrate the plugin metadata fields attached to JSONL records (simulated; run_hook() is not called)."""
    +
    +    def test_plugin_name_added(self):
    +        """The plugin name should be the hook script's parent directory name; plugin_hook is the script path."""
    +        # Simulate what run_hook() does
    +        script = Path("/abx_plugins/plugins/wget/on_Snapshot__50_wget.py")
    +        plugin_name = script.parent.name
    +
    +        record = {"type": "ArchiveResult", "status": "succeeded"}
    +        record["plugin"] = plugin_name
    +        record["plugin_hook"] = str(script)
    +
    +        self.assertEqual(record["plugin"], "wget")
    +        self.assertIn("on_Snapshot__50_wget.py", record["plugin_hook"])
    +
    +
    +if __name__ == "__main__":  # allow running this test module directly as a script
    +    unittest.main()
    diff --git a/archivebox/tests/test_machine_models.py b/archivebox/tests/test_machine_models.py
    new file mode 100644
    index 0000000000..773668ff33
    --- /dev/null
    +++ b/archivebox/tests/test_machine_models.py
    @@ -0,0 +1,840 @@
    +"""
    +Unit tests for machine module models: Machine, NetworkInterface, Binary, Process.
    +
    +Tests cover:
    +1. Machine model creation and current() method
    +2. NetworkInterface model and network detection
    +3. Binary model lifecycle and state machine
    +4. Process model lifecycle, hierarchy, and state machine
    +5. JSONL serialization/deserialization
    +6. Manager methods
    +7. Process tracking methods (replacing pid_utils)
    +"""
    +
    +import os
    +from datetime import timedelta
    +from typing import cast
    +from unittest.mock import Mock, patch
    +
    +import pytest
    +from django.test import TestCase
    +from django.utils import timezone
    +
    +from archivebox.machine.models import (
    +    BinaryManager,
    +    Machine,
    +    NetworkInterface,
    +    Binary,
    +    Process,
    +    BinaryMachine,
    +    ProcessMachine,
    +    MACHINE_RECHECK_INTERVAL,
    +    PID_REUSE_WINDOW,
    +    PROCESS_TIMEOUT_GRACE,
    +)
    +
    +
    +class TestMachineModel(TestCase):
    +    """Test the Machine model."""
    +
    +    def setUp(self):
    +        """Reset cached machine between tests."""
    +        import archivebox.machine.models as models
    +
    +        models._CURRENT_MACHINE = None
    +
    +    def test_machine_current_creates_machine(self):
    +        """Machine.current() should create a machine if none exists."""
    +        machine = Machine.current()
    +
    +        self.assertIsNotNone(machine)
    +        self.assertIsNotNone(machine.id)
    +        self.assertIsNotNone(machine.guid)
    +        self.assertEqual(machine.hostname, os.uname().nodename)
    +        self.assertIn(machine.os_family, ["linux", "darwin", "windows", "freebsd"])
    +
    +    def test_machine_current_returns_cached(self):
    +        """Machine.current() should return cached machine within recheck interval."""
    +        machine1 = Machine.current()
    +        machine2 = Machine.current()
    +
    +        self.assertEqual(machine1.id, machine2.id)
    +
    +    def test_machine_current_refreshes_after_interval(self):
    +        """Machine.current() should refresh after recheck interval."""
    +        import archivebox.machine.models as models
    +
    +        machine1 = Machine.current()
    +
    +        # Manually expire the cache by modifying modified_at
    +        machine1.modified_at = timezone.now() - timedelta(seconds=MACHINE_RECHECK_INTERVAL + 1)
    +        machine1.save()
    +        models._CURRENT_MACHINE = machine1
    +
    +        machine2 = Machine.current()
    +
    +        # Should have fetched/updated the machine (same GUID)
    +        self.assertEqual(machine1.guid, machine2.guid)
    +
    +    def test_machine_current_recreates_stale_cached_row(self):
    +        """Machine.current() should recreate the cached machine if the row was deleted."""
    +        import archivebox.machine.models as models
    +
    +        machine1 = Machine.current()
    +        machine1_id = machine1.id
    +        machine1_guid = machine1.guid
    +
    +        machine1.delete()
    +        models._CURRENT_MACHINE = machine1
    +
    +        machine2 = Machine.current()
    +
    +        self.assertNotEqual(machine1_id, machine2.id)
    +        self.assertEqual(machine1_guid, machine2.guid)
    +
    +    def test_machine_from_jsonl_update(self):
    +        """Machine.from_json() should update machine config."""
    +        Machine.current()  # Ensure machine exists
    +        record = {
    +            "config": {
    +                "WGET_BINARY": "/usr/bin/wget",
    +            },
    +        }
    +
    +        result = Machine.from_json(record)
    +
    +        self.assertIsNotNone(result)
    +        assert result is not None
    +        self.assertEqual(result.config.get("WGET_BINARY"), "/usr/bin/wget")
    +
    +    def test_machine_from_jsonl_strips_legacy_chromium_version(self):
    +        """Machine.from_json() should ignore legacy browser version keys."""
    +        Machine.current()  # Ensure machine exists
    +        record = {
    +            "config": {
    +                "WGET_BINARY": "/usr/bin/wget",
    +                "CHROMIUM_VERSION": "123.4.5",
    +            },
    +        }
    +
    +        result = Machine.from_json(record)
    +
    +        self.assertIsNotNone(result)
    +        assert result is not None
    +        self.assertEqual(result.config.get("WGET_BINARY"), "/usr/bin/wget")
    +        self.assertNotIn("CHROMIUM_VERSION", result.config)
    +
    +    def test_machine_from_jsonl_invalid(self):
    +        """Machine.from_json() should return None for invalid records."""
    +        result = Machine.from_json({"invalid": "record"})
    +        self.assertIsNone(result)
    +
    +    def test_machine_current_strips_legacy_chromium_version(self):
    +        """Machine.current() should return a config without the legacy CHROMIUM_VERSION key."""
    +        import archivebox.machine.models as machine_models
    +
    +        persisted = Machine.current()
    +        persisted.config = {"CHROME_BINARY": "/tmp/chromium", "CHROMIUM_VERSION": "123.4.5"}
    +        persisted.save(update_fields=["config"])
    +
    +        # Put the instance holding the legacy key into the module-level cache before re-reading.
    +        machine_models._CURRENT_MACHINE = persisted
    +
    +        refreshed = Machine.current()
    +        current_config = refreshed.config
    +
    +        self.assertEqual(current_config.get("CHROME_BINARY"), "/tmp/chromium")
    +        self.assertNotIn("CHROMIUM_VERSION", current_config)
    +
    +    def test_machine_manager_current(self):
    +        """Machine.current() should return a machine, and a repeated call should yield the same id."""
    +        machine = Machine.current()  # NOTE(review): despite the test name, Machine.objects is never exercised here
    +        self.assertIsNotNone(machine)
    +        self.assertEqual(machine.id, Machine.current().id)
    +
    +
    +class TestNetworkInterfaceModel(TestCase):
    +    """Test NetworkInterface.current(): creation, repeated lookup, and refresh after a network change."""
    +
    +    def setUp(self):
    +        """Reset the module-level machine and interface caches between tests."""
    +        import archivebox.machine.models as models
    +
    +        models._CURRENT_MACHINE = None
    +        models._CURRENT_INTERFACE = None
    +
    +    def test_networkinterface_current_creates_interface(self):
    +        """NetworkInterface.current() should return a saved interface with a machine and local IP."""
    +        interface = NetworkInterface.current()
    +
    +        self.assertIsNotNone(interface)
    +        self.assertIsNotNone(interface.id)
    +        self.assertIsNotNone(interface.machine)
    +        self.assertIsNotNone(interface.ip_local)
    +
    +    def test_networkinterface_current_returns_cached(self):
    +        """Two back-to-back NetworkInterface.current() calls should return the same row id."""
    +        interface1 = NetworkInterface.current()
    +        interface2 = NetworkInterface.current()
    +
    +        self.assertEqual(interface1.id, interface2.id)
    +
    +    def test_networkinterface_manager_current(self):
    +        """NetworkInterface.current() should return a non-None interface."""
    +        interface = NetworkInterface.current()  # NOTE(review): test name says "manager", but NetworkInterface.objects is not called
    +        self.assertIsNotNone(interface)
    +
    +    def test_networkinterface_current_refresh_creates_new_interface_when_properties_change(self):
    +        """Refreshing should persist a new NetworkInterface row when the host network fingerprint changes."""
    +        import archivebox.machine.models as models
    +
    +        first = {
    +            "mac_address": "aa:bb:cc:dd:ee:01",
    +            "ip_public": "1.1.1.1",
    +            "ip_local": "192.168.1.10",
    +            "dns_server": "8.8.8.8",
    +            "hostname": "host-a",
    +            "iface": "en0",
    +            "isp": "ISP A",
    +            "city": "City",
    +            "region": "Region",
    +            "country": "Country",
    +        }
    +        second = {
    +            **first,  # same host fingerprint except for the two IP fields overridden below
    +            "ip_public": "2.2.2.2",
    +            "ip_local": "10.0.0.5",
    +        }
    +
    +        with patch.object(models, "get_host_network", side_effect=[first, second]):  # successive calls yield first, then second
    +            interface1 = NetworkInterface.current(refresh=True)
    +            interface2 = NetworkInterface.current(refresh=True)
    +
    +        self.assertNotEqual(interface1.id, interface2.id)
    +        self.assertEqual(interface1.machine_id, interface2.machine_id)
    +        self.assertEqual(NetworkInterface.objects.filter(machine=interface1.machine).count(), 2)
    +
    +
    +class TestBinaryModel(TestCase):
    +    """Test the Binary model: defaults, validity, manager lookup, requeueing and from_json()."""
    +
    +    def setUp(self):
    +        """Reset the cached machine and binaries, then resolve the current machine."""
    +        import archivebox.machine.models as models
    +
    +        models._CURRENT_MACHINE = None
    +        models._CURRENT_BINARIES = {}
    +        self.machine = Machine.current()
    +
    +    def test_binary_creation(self):
    +        """A newly created Binary should get an id, start QUEUED, and not be valid yet."""
    +        binary = Binary.objects.create(
    +            machine=self.machine,
    +            name="wget",
    +            binproviders="apt,brew,env",
    +        )
    +
    +        self.assertIsNotNone(binary.id)
    +        self.assertEqual(binary.name, "wget")
    +        self.assertEqual(binary.status, Binary.StatusChoices.QUEUED)
    +        self.assertFalse(binary.is_valid)
    +
    +    def test_binary_is_valid(self):
    +        """Binary.is_valid should be True for installed binaries with a resolved path."""
    +        binary = Binary.objects.create(
    +            machine=self.machine,
    +            name="wget",
    +            abspath="/usr/bin/wget",
    +            version="1.21",
    +            status=Binary.StatusChoices.INSTALLED,
    +        )
    +
    +        self.assertTrue(binary.is_valid)
    +
    +    def test_binary_manager_get_valid_binary(self):
    +        """BinaryManager.get_valid_binary() should return the installed row, not the unresolved one."""
    +        # Create invalid binary (no abspath)
    +        Binary.objects.create(machine=self.machine, name="wget")
    +
    +        # Create valid binary
    +        Binary.objects.create(
    +            machine=self.machine,
    +            name="wget",
    +            abspath="/usr/bin/wget",
    +            version="1.21",
    +            status=Binary.StatusChoices.INSTALLED,
    +        )
    +
    +        result = cast(BinaryManager, Binary.objects).get_valid_binary("wget")  # cast exposes the custom manager method to type checkers
    +
    +        self.assertIsNotNone(result)
    +        assert result is not None
    +        self.assertEqual(result.abspath, "/usr/bin/wget")
    +
    +    def test_binary_update_and_requeue(self):
    +        """Binary.update_and_requeue() should persist the row and advance modified_at."""
    +        binary = Binary.objects.create(machine=self.machine, name="test")
    +        old_modified = binary.modified_at
    +
    +        binary.update_and_requeue(
    +            status=Binary.StatusChoices.QUEUED,
    +            retry_at=timezone.now() + timedelta(seconds=60),
    +        )
    +
    +        binary.refresh_from_db()  # re-read so the assertions see what was actually saved
    +        self.assertEqual(binary.status, Binary.StatusChoices.QUEUED)
    +        self.assertGreater(binary.modified_at, old_modified)
    +
    +    def test_binary_from_json_preserves_provider_overrides(self):
    +        """Binary.from_json() should persist provider overrides unchanged."""
    +        overrides = {
    +            "apt": {"install_args": ["chromium"]},
    +            "npm": {"install_args": "puppeteer"},
    +            "custom": {"install": "bash -lc 'echo ok'"},
    +        }
    +
    +        binary = Binary.from_json(
    +            {
    +                "name": "chrome",
    +                "binproviders": "apt,npm,custom",
    +                "overrides": overrides,
    +            },
    +        )
    +
    +        self.assertIsNotNone(binary)
    +        assert binary is not None
    +        self.assertEqual(binary.overrides, overrides)
    +
    +    def test_binary_from_json_does_not_coerce_legacy_override_shapes(self):
    +        """Binary.from_json() should no longer translate legacy non-dict provider overrides."""
    +        overrides = {
    +            "apt": ["chromium"],  # legacy shape: bare list instead of a dict
    +            "npm": "puppeteer",  # legacy shape: bare string instead of a dict
    +        }
    +
    +        binary = Binary.from_json(
    +            {
    +                "name": "chrome",
    +                "binproviders": "apt,npm",
    +                "overrides": overrides,
    +            },
    +        )
    +
    +        self.assertIsNotNone(binary)
    +        assert binary is not None
    +        self.assertEqual(binary.overrides, overrides)
    +
    +    def test_binary_from_json_prefers_published_readability_package(self):
    +        """Binary.from_json() should rewrite readability's npm git URL to the published package."""
    +        binary = Binary.from_json(
    +            {
    +                "name": "readability-extractor",
    +                "binproviders": "env,npm",
    +                "overrides": {
    +                    "npm": {
    +                        "install_args": ["https://github.com/ArchiveBox/readability-extractor"],
    +                    },
    +                },
    +            },
    +        )
    +
    +        self.assertIsNotNone(binary)
    +        assert binary is not None
    +        self.assertEqual(
    +            binary.overrides,
    +            {
    +                "npm": {
    +                    "install_args": ["readability-extractor"],  # git URL replaced by the bare package name
    +                },
    +            },
    +        )
    +
    +
    +class TestBinaryStateMachine(TestCase):
    +    """Exercise BinaryMachine against a freshly created Binary row."""
    +
    +    def setUp(self):
    +        """Reset the cached machine and create the Binary the state machine wraps."""
    +        import archivebox.machine.models as machine_models
    +
    +        machine_models._CURRENT_MACHINE = None
    +        self.machine = Machine.current()
    +        self.binary = Binary.objects.create(
    +            name="test-binary",
    +            binproviders="env",
    +            machine=self.machine,
    +        )
    +
    +    def test_binary_state_machine_initial_state(self):
    +        """A BinaryMachine built on a new Binary should report the QUEUED state."""
    +        state_machine = BinaryMachine(self.binary)
    +        self.assertEqual(state_machine.current_state_value, Binary.StatusChoices.QUEUED)
    +
    +    def test_binary_state_machine_can_start(self):
    +        """BinaryMachine.can_install() should be True initially and False once binproviders is empty."""
    +        self.assertTrue(BinaryMachine(self.binary).can_install())
    +
    +        # Blank out the providers, persist, and wrap the binary in a new state machine.
    +        self.binary.binproviders = ""
    +        self.binary.save()
    +        rebuilt = BinaryMachine(self.binary)
    +        self.assertFalse(rebuilt.can_install())
    +
    +
    +class TestProcessModel(TestCase):
    +    """Cover Process row defaults, JSON serialization and update_and_requeue()."""
    +
    +    def setUp(self):
    +        """Clear the cached process and machine, then resolve the current machine."""
    +        import archivebox.machine.models as machine_models
    +
    +        machine_models._CURRENT_PROCESS = None
    +        machine_models._CURRENT_MACHINE = None
    +        self.machine = Machine.current()
    +
    +    def test_process_creation(self):
    +        """A new Process should be QUEUED with no pid and no exit code."""
    +        argv = ["echo", "hello"]
    +        created = Process.objects.create(machine=self.machine, cmd=argv, pwd="/tmp")
    +
    +        # The saved row has a primary key and keeps the command it was given.
    +        self.assertIsNotNone(created.id)
    +        self.assertEqual(created.cmd, ["echo", "hello"])
    +
    +        # Nothing has been launched yet.
    +        self.assertEqual(created.status, Process.StatusChoices.QUEUED)
    +        self.assertIsNone(created.pid)
    +        self.assertIsNone(created.exit_code)
    +
    +    def test_process_to_jsonl(self):
    +        """Process.to_json() should emit the type tag plus cmd, pwd and timeout."""
    +        serialized = Process.objects.create(
    +            machine=self.machine,
    +            cmd=["echo", "hello"],
    +            pwd="/tmp",
    +            timeout=60,
    +        ).to_json()
    +
    +        # The record is tagged with its model name and echoes the stored fields.
    +        self.assertEqual(serialized["type"], "Process")
    +        self.assertEqual(serialized["cmd"], ["echo", "hello"])
    +        self.assertEqual(serialized["pwd"], "/tmp")
    +        self.assertEqual(serialized["timeout"], 60)
    +
    +    def test_process_update_and_requeue(self):
    +        """Process.update_and_requeue() should persist the fields it is given."""
    +        tracked = Process.objects.create(machine=self.machine, cmd=["test"])
    +
    +        changes = {
    +            "status": Process.StatusChoices.RUNNING,
    +            "pid": 12345,
    +            "started_at": timezone.now(),
    +        }
    +        tracked.update_and_requeue(**changes)
    +        tracked.refresh_from_db()
    +        self.assertEqual(tracked.status, Process.StatusChoices.RUNNING)
    +        self.assertEqual(tracked.pid, 12345)
    +        self.assertIsNotNone(tracked.started_at)
    +
    +
    +class TestProcessCurrent(TestCase):
    +    """Test Process.current(), Process._detect_process_type() and Process.proc."""
    +
    +    def setUp(self):
    +        """Reset the module-level machine and process caches."""
    +        import archivebox.machine.models as models
    +
    +        models._CURRENT_MACHINE = None
    +        models._CURRENT_PROCESS = None
    +
    +    def test_process_current_creates_record(self):
    +        """Process.current() should return a RUNNING Process for this PID with machine and iface set."""
    +        proc = Process.current()
    +
    +        self.assertIsNotNone(proc)
    +        self.assertEqual(proc.pid, os.getpid())
    +        self.assertEqual(proc.status, Process.StatusChoices.RUNNING)
    +        self.assertIsNotNone(proc.machine)
    +        self.assertIsNotNone(proc.iface)
    +        self.assertEqual(proc.iface.machine_id, proc.machine_id)  # the interface must belong to the same machine
    +        self.assertIsNotNone(proc.started_at)
    +
    +    def test_process_current_caches(self):
    +        """Two consecutive Process.current() calls should return the same row id."""
    +        proc1 = Process.current()
    +        proc2 = Process.current()
    +
    +        self.assertEqual(proc1.id, proc2.id)
    +
    +    def test_process_detect_type_runner(self):
    +        """_detect_process_type should classify `archivebox run --daemon` as ORCHESTRATOR."""
    +        with patch("sys.argv", ["archivebox", "run", "--daemon"]):
    +            result = Process._detect_process_type()
    +            self.assertEqual(result, Process.TypeChoices.ORCHESTRATOR)
    +
    +    def test_process_detect_type_runner_watch(self):
    +        """runner_watch should be classified as a worker, not the orchestrator itself."""
    +        with patch("sys.argv", ["archivebox", "manage", "runner_watch", "--pidfile=/tmp/runserver.pid"]):
    +            result = Process._detect_process_type()
    +            self.assertEqual(result, Process.TypeChoices.WORKER)
    +
    +    def test_process_detect_type_cli(self):
    +        """_detect_process_type should classify `archivebox add <url>` as CLI."""
    +        with patch("sys.argv", ["archivebox", "add", "http://example.com"]):
    +            result = Process._detect_process_type()
    +            self.assertEqual(result, Process.TypeChoices.CLI)
    +
    +    def test_process_detect_type_binary(self):
    +        """_detect_process_type should detect non-ArchiveBox subprocesses as binary processes."""
    +        with patch("sys.argv", ["/usr/bin/wget", "https://example.com"]):
    +            result = Process._detect_process_type()
    +            self.assertEqual(result, Process.TypeChoices.BINARY)
    +
    +    def test_process_proc_allows_interpreter_wrapped_script(self):
    +        """Process.proc should accept a script recorded in DB when wrapped by an interpreter in psutil."""
    +        proc = Process.objects.create(
    +            machine=Machine.current(),
    +            cmd=["/tmp/on_CrawlSetup__90_chrome_launch.daemon.bg.js", "--url=https://example.com/"],
    +            pid=12345,
    +            status=Process.StatusChoices.RUNNING,
    +            started_at=timezone.now(),
    +        )
    +
    +        os_proc = Mock()  # stand-in for the psutil.Process patched in below
    +        os_proc.create_time.return_value = proc.started_at.timestamp()  # start time matches the DB row exactly
    +        os_proc.cmdline.return_value = [
    +            "node",  # interpreter prefix that is absent from the recorded cmd
    +            "/tmp/on_CrawlSetup__90_chrome_launch.daemon.bg.js",
    +            "--url=https://example.com/",
    +        ]
    +
    +        with patch("archivebox.machine.models.psutil.Process", return_value=os_proc):
    +            self.assertIs(proc.proc, os_proc)
    +
    +
    +class TestProcessHierarchy(TestCase):
    +    """Check the parent/children links plus the derived root and depth of Process rows."""
    +
    +    def setUp(self):
    +        """Reset the cached machine and resolve the current one."""
    +        import archivebox.machine.models as machine_models
    +
    +        machine_models._CURRENT_MACHINE = None
    +        self.machine = Machine.current()
    +
    +    def test_process_parent_child(self):
    +        """A child should point at its parent and show up in parent.children."""
    +        cli_parent = Process.objects.create(
    +            machine=self.machine,
    +            process_type=Process.TypeChoices.CLI,
    +            status=Process.StatusChoices.RUNNING,
    +            pid=1,
    +            started_at=timezone.now(),
    +        )
    +
    +        worker_child = Process.objects.create(
    +            machine=self.machine,
    +            parent=cli_parent,
    +            process_type=Process.TypeChoices.WORKER,
    +            status=Process.StatusChoices.RUNNING,
    +            pid=2,
    +            started_at=timezone.now(),
    +        )
    +
    +        self.assertEqual(worker_child.parent, cli_parent)
    +        self.assertIn(worker_child, cli_parent.children.all())
    +
    +    def test_process_root(self):
    +        """Process.root should resolve to the top of a three-level chain from every level."""
    +        top = Process.objects.create(
    +            machine=self.machine,
    +            process_type=Process.TypeChoices.CLI,
    +            status=Process.StatusChoices.RUNNING,
    +            started_at=timezone.now(),
    +        )
    +        middle = Process.objects.create(
    +            machine=self.machine,
    +            parent=top,
    +            status=Process.StatusChoices.RUNNING,
    +            started_at=timezone.now(),
    +        )
    +        leaf = Process.objects.create(
    +            machine=self.machine,
    +            parent=middle,
    +            status=Process.StatusChoices.RUNNING,
    +            started_at=timezone.now(),
    +        )
    +
    +        self.assertEqual(leaf.root, top)
    +        self.assertEqual(middle.root, top)
    +        self.assertEqual(top.root, top)
    +
    +    def test_process_depth(self):
    +        """Process.depth should be 0 for a parentless row and 1 for its direct child."""
    +        top = Process.objects.create(
    +            machine=self.machine,
    +            status=Process.StatusChoices.RUNNING,
    +            started_at=timezone.now(),
    +        )
    +        nested = Process.objects.create(
    +            machine=self.machine,
    +            parent=top,
    +            status=Process.StatusChoices.RUNNING,
    +            started_at=timezone.now(),
    +        )
    +
    +        self.assertEqual(top.depth, 0)
    +        self.assertEqual(nested.depth, 1)
    +
    +
    +class TestProcessLifecycle(TestCase):
    +    """Test Process lifecycle methods: is_running, poll() and terminate()."""
    +
    +    def setUp(self):
    +        """Reset the cached machine and resolve the current one."""
    +        import archivebox.machine.models as models
    +
    +        models._CURRENT_MACHINE = None
    +        self.machine = Machine.current()
    +
    +    def test_process_is_running_current_pid(self):
    +        """is_running should be True when pid and started_at match the live test process."""
    +        import psutil
    +        from datetime import datetime
    +
    +        # Use the OS-reported creation time of this process as the row's started_at.
    +        proc_start = datetime.fromtimestamp(psutil.Process(os.getpid()).create_time(), tz=timezone.get_current_timezone())
    +        proc = Process.objects.create(
    +            machine=self.machine,
    +            status=Process.StatusChoices.RUNNING,
    +            pid=os.getpid(),
    +            started_at=proc_start,
    +        )
    +
    +        self.assertTrue(proc.is_running)
    +
    +    def test_process_is_running_fake_pid(self):
    +        """is_running should be False for non-existent PID."""
    +        proc = Process.objects.create(
    +            machine=self.machine,
    +            status=Process.StatusChoices.RUNNING,
    +            pid=999999,  # assumed not to be a live PID on the test host — TODO confirm on systems with a large pid_max
    +            started_at=timezone.now(),
    +        )
    +
    +        self.assertFalse(proc.is_running)
    +
    +    def test_process_poll_detects_exit(self):
    +        """poll() should return an exit code and mark a RUNNING row with a dead PID as EXITED."""
    +        proc = Process.objects.create(
    +            machine=self.machine,
    +            status=Process.StatusChoices.RUNNING,
    +            pid=999999,  # assumed not to be a live PID on the test host
    +            started_at=timezone.now(),
    +        )
    +
    +        exit_code = proc.poll()
    +
    +        self.assertIsNotNone(exit_code)
    +        proc.refresh_from_db()
    +        self.assertEqual(proc.status, Process.StatusChoices.EXITED)
    +
    +    def test_process_poll_normalizes_negative_exit_code(self):
    +        """poll() should normalize -1 exit codes to 137."""
    +        proc = Process.objects.create(
    +            machine=self.machine,
    +            status=Process.StatusChoices.EXITED,
    +            pid=999999,
    +            exit_code=-1,
    +            started_at=timezone.now(),
    +        )
    +
    +        exit_code = proc.poll()
    +
    +        self.assertEqual(exit_code, 137)
    +        proc.refresh_from_db()  # the normalized value must also be persisted, not just returned
    +        self.assertEqual(proc.exit_code, 137)
    +
    +    def test_process_terminate_dead_process(self):
    +        """terminate() should return False and mark the row EXITED when the PID is already gone."""
    +        proc = Process.objects.create(
    +            machine=self.machine,
    +            status=Process.StatusChoices.RUNNING,
    +            pid=999999,  # assumed not to be a live PID on the test host
    +            started_at=timezone.now(),
    +        )
    +
    +        result = proc.terminate()
    +
    +        self.assertFalse(result)
    +        proc.refresh_from_db()
    +        self.assertEqual(proc.status, Process.StatusChoices.EXITED)
    +
    +
    +class TestProcessClassMethods(TestCase):
    +    """Test Process class methods for querying and cleaning up RUNNING rows."""
    +
    +    def setUp(self):
    +        """Reset the cached machine and resolve the current one."""
    +        import archivebox.machine.models as models
    +
    +        models._CURRENT_MACHINE = None
    +        self.machine = Machine.current()
    +
    +    def test_get_running(self):
    +        """get_running(process_type=HOOK) should include a RUNNING hook row."""
    +        proc = Process.objects.create(
    +            machine=self.machine,
    +            process_type=Process.TypeChoices.HOOK,
    +            status=Process.StatusChoices.RUNNING,
    +            pid=99999,
    +            started_at=timezone.now(),
    +        )
    +
    +        running = Process.get_running(process_type=Process.TypeChoices.HOOK)
    +
    +        self.assertIn(proc, running)
    +
    +    def test_get_running_count(self):
    +        """get_running_count should count at least the three RUNNING hook rows created here."""
    +        for i in range(3):
    +            Process.objects.create(
    +                machine=self.machine,
    +                process_type=Process.TypeChoices.HOOK,
    +                status=Process.StatusChoices.RUNNING,
    +                pid=99900 + i,  # distinct PID per row
    +                started_at=timezone.now(),
    +            )
    +
    +        count = Process.get_running_count(process_type=Process.TypeChoices.HOOK)
    +        self.assertGreaterEqual(count, 3)
    +
    +    def test_cleanup_stale_running(self):
    +        """cleanup_stale_running should mark a RUNNING row started before PID_REUSE_WINDOW as exited."""
    +        stale = Process.objects.create(
    +            machine=self.machine,
    +            status=Process.StatusChoices.RUNNING,
    +            pid=999999,
    +            started_at=timezone.now() - PID_REUSE_WINDOW - timedelta(hours=1),  # one hour older than the window
    +        )
    +
    +        cleaned = Process.cleanup_stale_running()
    +
    +        self.assertGreaterEqual(cleaned, 1)
    +        stale.refresh_from_db()
    +        self.assertEqual(stale.status, Process.StatusChoices.EXITED)
    +
    +    def test_cleanup_stale_running_marks_timed_out_rows_exited(self):
    +        """cleanup_stale_running should retire RUNNING rows that exceed timeout + grace."""
    +        stale = Process.objects.create(
    +            machine=self.machine,
    +            status=Process.StatusChoices.RUNNING,
    +            pid=999998,
    +            timeout=5,
    +            started_at=timezone.now() - PROCESS_TIMEOUT_GRACE - timedelta(seconds=10),  # 10s past grace, i.e. beyond the 5s timeout
    +        )
    +
    +        cleaned = Process.cleanup_stale_running()
    +
    +        self.assertGreaterEqual(cleaned, 1)
    +        stale.refresh_from_db()
    +        self.assertEqual(stale.status, Process.StatusChoices.EXITED)
    +
    +    def test_cleanup_stale_running_marks_timed_out_live_hooks_exited(self):
    +        """Timed-out live hook rows should be retired in the DB without trying to kill the process."""
    +        stale = Process.objects.create(
    +            machine=self.machine,
    +            process_type=Process.TypeChoices.HOOK,
    +            status=Process.StatusChoices.RUNNING,
    +            pid=os.getpid(),  # PID of the test process itself, so it is alive
    +            timeout=5,
    +            started_at=timezone.now() - PROCESS_TIMEOUT_GRACE - timedelta(seconds=10),
    +        )
    +
    +        with (
    +            patch.object(Process, "poll", return_value=None),  # poll() is forced to return None
    +            patch.object(Process, "kill_tree") as kill_tree,
    +            patch.object(Process, "terminate") as terminate,
    +        ):
    +            cleaned = Process.cleanup_stale_running()
    +
    +        self.assertGreaterEqual(cleaned, 1)
    +        stale.refresh_from_db()
    +        self.assertEqual(stale.status, Process.StatusChoices.EXITED)
    +        kill_tree.assert_not_called()
    +        terminate.assert_not_called()
    +
    +    def test_cleanup_orphaned_workers_marks_dead_root_children_exited(self):
    +        """cleanup_orphaned_workers should retire rows whose CLI/orchestrator root is gone."""
    +        import psutil
    +        from datetime import datetime
    +
    +        # OS-reported creation time of this test process, used as the child's started_at.
    +        started_at = datetime.fromtimestamp(psutil.Process(os.getpid()).create_time(), tz=timezone.get_current_timezone())
    +        parent = Process.objects.create(
    +            machine=self.machine,
    +            process_type=Process.TypeChoices.CLI,
    +            status=Process.StatusChoices.RUNNING,
    +            pid=999997,  # assumed not to be a live PID on the test host
    +            started_at=timezone.now() - timedelta(minutes=5),
    +        )
    +        child = Process.objects.create(
    +            machine=self.machine,
    +            parent=parent,
    +            process_type=Process.TypeChoices.HOOK,
    +            status=Process.StatusChoices.RUNNING,
    +            pid=os.getpid(),  # live PID paired with its real start time
    +            started_at=started_at,
    +        )
    +
    +        with patch.object(Process, "kill_tree") as kill_tree, patch.object(Process, "terminate") as terminate:
    +            cleaned = Process.cleanup_orphaned_workers()
    +
    +        self.assertEqual(cleaned, 1)
    +        child.refresh_from_db()
    +        self.assertEqual(child.status, Process.StatusChoices.EXITED)
    +        kill_tree.assert_not_called()
    +        terminate.assert_not_called()
    +
    +
    +class TestProcessStateMachine(TestCase):
    +    """Exercise ProcessMachine against a freshly created Process row."""
    +
    +    def setUp(self):
    +        """Reset the cached machine and create the Process the state machine wraps."""
    +        import archivebox.machine.models as machine_models
    +
    +        machine_models._CURRENT_MACHINE = None
    +        self.machine = Machine.current()
    +        self.process = Process.objects.create(
    +            cmd=["echo", "test"],
    +            pwd="/tmp",
    +            machine=self.machine,
    +        )
    +
    +    def test_process_state_machine_initial_state(self):
    +        """A ProcessMachine built on a new Process should report the QUEUED state."""
    +        state_machine = ProcessMachine(self.process)
    +        self.assertEqual(state_machine.current_state_value, Process.StatusChoices.QUEUED)
    +
    +    def test_process_state_machine_can_start(self):
    +        """ProcessMachine.can_start() should be True initially and False once cmd is empty."""
    +        self.assertTrue(ProcessMachine(self.process).can_start())
    +
    +        # Empty the command, persist, and wrap the process in a new state machine.
    +        self.process.cmd = []
    +        self.process.save()
    +        rebuilt = ProcessMachine(self.process)
    +        self.assertFalse(rebuilt.can_start())
    +
    +    def test_process_state_machine_is_exited(self):
    +        """ProcessMachine.is_exited() should turn True once exit_code is set to 0."""
    +        self.assertFalse(ProcessMachine(self.process).is_exited())
    +
    +        # Record an exit code, persist, and wrap the process in a new state machine.
    +        self.process.exit_code = 0
    +        self.process.save()
    +        rebuilt = ProcessMachine(self.process)
    +        self.assertTrue(rebuilt.is_exited())
    +
    +
    +if __name__ == "__main__":  # only when this file is executed directly, not when imported
    +    pytest.main([__file__, "-v"])  # run just this file through pytest in verbose mode
    diff --git a/archivebox/tests/test_migrations_04_to_09.py b/archivebox/tests/test_migrations_04_to_09.py
    new file mode 100644
    index 0000000000..195340960c
    --- /dev/null
    +++ b/archivebox/tests/test_migrations_04_to_09.py
    @@ -0,0 +1,182 @@
    +#!/usr/bin/env python3
    +"""
    +Migration tests from 0.4.x to 0.9.x.
    +
    +0.4.x was the first Django-powered version with a simpler schema:
    +- No Tag model (tags stored as comma-separated string in Snapshot)
    +- No ArchiveResult model (results stored in JSON files)
    +"""
    +
    +import shutil
    +import sqlite3
    +import tempfile
    +import unittest
    +from pathlib import Path
    +from typing import cast
    +
    +from .migrations_helpers import (
    +    SCHEMA_0_4,
    +    seed_0_4_data,
    +    run_archivebox,
    +    create_data_dir_structure,
    +    verify_snapshot_count,
    +    verify_snapshot_urls,
    +    verify_tag_count,
    +)
    +
    +
    +class TestMigrationFrom04x(unittest.TestCase):
    +    """Test migration from 0.4.x schema to latest."""
    +
    +    def setUp(self):
    +        """Create a temporary directory with 0.4.x schema and data."""
    +        self.work_dir = Path(tempfile.mkdtemp())
    +        # tearDown() is skipped when setUp() raises, so register the removal immediately.
    +        self.addCleanup(shutil.rmtree, self.work_dir, ignore_errors=True)
    +        self.db_path = self.work_dir / "index.sqlite3"
    +        create_data_dir_structure(self.work_dir)
    +
    +        # Create database with 0.4.x schema, closing the connection even if the script fails
    +        conn = sqlite3.connect(str(self.db_path))
    +        try:
    +            conn.executescript(SCHEMA_0_4)
    +        finally:
    +            conn.close()
    +        self.original_data = seed_0_4_data(self.db_path)
    +
    +    def tearDown(self):
    +        """Remove the temporary work directory created in setUp(), ignoring removal errors."""
    +        shutil.rmtree(self.work_dir, ignore_errors=True)
    +
    +    def test_migration_preserves_snapshot_count(self):
    +        """After `archivebox init`, the DB should hold as many snapshots as were seeded."""
    +        seeded_snapshots = self.original_data["snapshots"]
    +
    +        init_result = run_archivebox(self.work_dir, ["init"], timeout=45)
    +        self.assertEqual(init_result.returncode, 0, f"Init failed: {init_result.stderr}")
    +
    +        matches, detail = verify_snapshot_count(self.db_path, len(seeded_snapshots))
    +        self.assertTrue(matches, detail)
    +
    +    def test_migration_preserves_snapshot_urls(self):
    +        """After `archivebox init`, every seeded snapshot URL should still be present."""
    +        seeded_urls = [snapshot["url"] for snapshot in self.original_data["snapshots"]]
    +
    +        init_result = run_archivebox(self.work_dir, ["init"], timeout=45)
    +        self.assertEqual(init_result.returncode, 0, f"Init failed: {init_result.stderr}")
    +
    +        matches, detail = verify_snapshot_urls(self.db_path, seeded_urls)
    +        self.assertTrue(matches, detail)
    +
    +    def test_migration_converts_string_tags_to_model(self):
    +        """After `archivebox init`, the tag count should equal the number of distinct seeded tags."""
    +        init_result = run_archivebox(self.work_dir, ["init"], timeout=45)
    +        self.assertEqual(init_result.returncode, 0, f"Init failed: {init_result.stderr}")
    +
    +        # Distinct, whitespace-stripped tag names across all non-empty seeded tag strings
    +        distinct_tags = {
    +            name.strip()
    +            for joined in cast(list[str], self.original_data["tags_str"])
    +            if joined
    +            for name in joined.split(",")
    +        }
    +        # Tags should have been created
    +        matches, detail = verify_tag_count(self.db_path, len(distinct_tags))
    +        self.assertTrue(matches, detail)
    +
    +    def test_migration_preserves_snapshot_titles(self):
    +        """After `init`, each seeded URL in core_snapshot must still carry its seeded title."""
    +        init_proc = run_archivebox(self.work_dir, ["init"], timeout=45)
    +        self.assertEqual(init_proc.returncode, 0, f"Init failed: {init_proc.stderr}")
    +
    +        db = sqlite3.connect(str(self.db_path))
    +        cur = db.cursor()
    +        cur.execute("SELECT url, title FROM core_snapshot")
    +        titles_by_url = dict(cur.fetchall())
    +        db.close()
    +
    +        for seeded in self.original_data["snapshots"]:
    +            self.assertEqual(
    +                titles_by_url.get(seeded["url"]),
    +                seeded["title"],
    +                f"Title mismatch for {seeded['url']}",
    +            )
    +
    +    def test_status_works_after_migration(self):
    +        """`status` must exit 0 when run after a successful `init`."""
    +        init_proc = run_archivebox(self.work_dir, ["init"], timeout=45)
    +        self.assertEqual(init_proc.returncode, 0, f"Init failed: {init_proc.stderr}")
    +
    +        status_proc = run_archivebox(self.work_dir, ["status"])
    +        self.assertEqual(status_proc.returncode, 0, f"Status failed after migration: {status_proc.stderr}")
    +
    +    def test_list_works_after_migration(self):
    +        """`list` must exit 0 after `init` and its output must mention every seeded snapshot."""
    +        init_proc = run_archivebox(self.work_dir, ["init"], timeout=45)
    +        self.assertEqual(init_proc.returncode, 0, f"Init failed: {init_proc.stderr}")
    +
    +        list_proc = run_archivebox(self.work_dir, ["list"])
    +        self.assertEqual(list_proc.returncode, 0, f"List failed after migration: {list_proc.stderr}")
    +
    +        # Only the first 30 characters of each URL are searched for in stdout + stderr
    +        combined = list_proc.stdout + list_proc.stderr
    +        for seeded in self.original_data["snapshots"]:
    +            url_prefix = seeded["url"][:30]
    +            self.assertIn(
    +                url_prefix,
    +                combined,
    +                f"Snapshot {seeded['url']} not found in list output",
    +            )
    +
    +    def test_add_works_after_migration(self):
    +        """`add --index-only` must succeed after `init` and leave exactly one row for the new URL."""
    +        init_proc = run_archivebox(self.work_dir, ["init"], timeout=45)
    +        self.assertEqual(init_proc.returncode, 0, f"Init failed: {init_proc.stderr}")
    +
    +        # Add one URL that is looked up by its exact value below
    +        add_proc = run_archivebox(self.work_dir, ["add", "--index-only", "https://example.com/new-page"], timeout=45)
    +        self.assertEqual(add_proc.returncode, 0, f"Add failed after migration: {add_proc.stderr}")
    +
    +        # Count the core_snapshot rows holding that URL
    +        db = sqlite3.connect(str(self.db_path))
    +        cur = db.cursor()
    +        cur.execute("SELECT COUNT(*) FROM core_snapshot WHERE url = 'https://example.com/new-page'")
    +        matching_rows = cur.fetchone()[0]
    +        db.close()
    +
    +        self.assertEqual(matching_rows, 1, "New snapshot was not created after migration")
    +
    +    def test_new_schema_elements_created(self):
    +        """After `init`, crawls_crawl, core_tag and core_archiveresult must be listed in sqlite_master."""
    +        init_proc = run_archivebox(self.work_dir, ["init"], timeout=45)
    +        self.assertEqual(init_proc.returncode, 0, f"Init failed: {init_proc.stderr}")
    +
    +        db = sqlite3.connect(str(self.db_path))
    +        cur = db.cursor()
    +        cur.execute("SELECT name FROM sqlite_master WHERE type='table'")
    +        table_names = {record[0] for record in cur.fetchall()}
    +        db.close()
    +
    +        # Each of these tables is required to be present
    +        self.assertIn("crawls_crawl", table_names, "crawls_crawl table not created")
    +        self.assertIn("core_tag", table_names, "core_tag table not created")
    +        self.assertIn("core_archiveresult", table_names, "core_archiveresult table not created")
    +
    +    def test_snapshots_have_new_fields(self):
    +        """After `init`, core_snapshot must expose status, depth, created_at and modified_at columns."""
    +        init_proc = run_archivebox(self.work_dir, ["init"], timeout=45)
    +        self.assertEqual(init_proc.returncode, 0, f"Init failed: {init_proc.stderr}")
    +
    +        db = sqlite3.connect(str(self.db_path))
    +        cur = db.cursor()
    +        cur.execute("PRAGMA table_info(core_snapshot)")
    +        column_names = {record[1] for record in cur.fetchall()}  # index 1 of each row is used as the column name
    +        db.close()
    +
    +        expected_columns = {"status", "depth", "created_at", "modified_at"}
    +        for name in expected_columns:
    +            self.assertIn(name, column_names, f"Snapshot missing new column: {name}")
    +
    +
    +if __name__ == "__main__":
    +    unittest.main()  # run this module's tests when it is executed directly
    diff --git a/archivebox/tests/test_migrations_07_to_09.py b/archivebox/tests/test_migrations_07_to_09.py
    new file mode 100644
    index 0000000000..cfb04b7203
    --- /dev/null
    +++ b/archivebox/tests/test_migrations_07_to_09.py
    @@ -0,0 +1,383 @@
    +#!/usr/bin/env python3
    +"""
    +Migration tests from 0.7.x to 0.9.x.
    +
    +0.7.x schema includes:
    +- Tag model with ManyToMany to Snapshot
    +- ArchiveResult model with ForeignKey to Snapshot
    +- AutoField primary keys
    +"""
    +
    +import shutil
    +import sqlite3
    +import tempfile
    +import unittest
    +from pathlib import Path
    +
    +from .migrations_helpers import (
    +    SCHEMA_0_7,
    +    seed_0_7_data,
    +    run_archivebox,
    +    create_data_dir_structure,
    +    verify_snapshot_count,
    +    verify_snapshot_urls,
    +    verify_snapshot_titles,
    +    verify_tag_count,
    +    verify_archiveresult_count,
    +    verify_foreign_keys,
    +    verify_all_snapshots_in_output,
    +)
    +
    +
    +class TestMigrationFrom07x(unittest.TestCase):
    +    """Run `archivebox init` and follow-up commands against a database seeded with the 0.7.x schema."""
    +
    +    def setUp(self):
    +        """Build a scratch data dir holding a 0.7.x-schema database filled by seed_0_7_data."""
    +        self.work_dir = Path(tempfile.mkdtemp())
    +        self.db_path = self.work_dir / "index.sqlite3"
    +
    +        # Lay out the data directory
    +        create_data_dir_structure(self.work_dir)
    +
    +        # Apply the 0.7.x schema script to a fresh SQLite file
    +        db = sqlite3.connect(str(self.db_path))
    +        db.executescript(SCHEMA_0_7)
    +        db.close()
    +
    +        # Seed rows; the return value is kept for comparison in the tests
    +        self.original_data = seed_0_7_data(self.db_path)
    +
    +    def tearDown(self):
    +        """Delete the scratch data dir; errors during removal are ignored."""
    +        shutil.rmtree(self.work_dir, ignore_errors=True)
    +
    +    def test_migration_preserves_snapshot_count(self):
    +        """After `init`, verify_snapshot_count must accept the seeded snapshot total."""
    +        seeded_total = len(self.original_data["snapshots"])
    +
    +        init_proc = run_archivebox(self.work_dir, ["init"], timeout=45)
    +        self.assertEqual(init_proc.returncode, 0, f"Init failed: {init_proc.stderr}")
    +
    +        passed, detail = verify_snapshot_count(self.db_path, seeded_total)
    +        self.assertTrue(passed, detail)
    +
    +    def test_migration_preserves_snapshot_urls(self):
    +        """After `init`, verify_snapshot_urls must accept the seeded URL list."""
    +        seeded_urls = [entry["url"] for entry in self.original_data["snapshots"]]
    +
    +        init_proc = run_archivebox(self.work_dir, ["init"], timeout=45)
    +        self.assertEqual(init_proc.returncode, 0, f"Init failed: {init_proc.stderr}")
    +
    +        passed, detail = verify_snapshot_urls(self.db_path, seeded_urls)
    +        self.assertTrue(passed, detail)
    +
    +    def test_migration_preserves_snapshot_titles(self):
    +        """After `init`, verify_snapshot_titles must accept the seeded url -> title mapping."""
    +        seeded_titles = {entry["url"]: entry["title"] for entry in self.original_data["snapshots"]}
    +
    +        init_proc = run_archivebox(self.work_dir, ["init"], timeout=45)
    +        self.assertEqual(init_proc.returncode, 0, f"Init failed: {init_proc.stderr}")
    +
    +        passed, detail = verify_snapshot_titles(self.db_path, seeded_titles)
    +        self.assertTrue(passed, detail)
    +
    +    def test_migration_preserves_tags(self):
    +        """After `init`, verify_tag_count must accept the seeded tag total."""
    +        seeded_total = len(self.original_data["tags"])
    +
    +        init_proc = run_archivebox(self.work_dir, ["init"], timeout=45)
    +        self.assertEqual(init_proc.returncode, 0, f"Init failed: {init_proc.stderr}")
    +
    +        passed, detail = verify_tag_count(self.db_path, seeded_total)
    +        self.assertTrue(passed, detail)
    +
    +    def test_migration_preserves_archiveresults(self):
    +        """After `init`, verify_archiveresult_count must accept the seeded archive result total."""
    +        seeded_total = len(self.original_data["archiveresults"])
    +
    +        init_proc = run_archivebox(self.work_dir, ["init"], timeout=45)
    +        self.assertEqual(init_proc.returncode, 0, f"Init failed: {init_proc.stderr}")
    +
    +        passed, detail = verify_archiveresult_count(self.db_path, seeded_total)
    +        self.assertTrue(passed, detail)
    +
    +    def test_migration_preserves_foreign_keys(self):
    +        """After `init`, verify_foreign_keys must report success for the migrated database."""
    +        init_proc = run_archivebox(self.work_dir, ["init"], timeout=45)
    +        self.assertEqual(init_proc.returncode, 0, f"Init failed: {init_proc.stderr}")
    +
    +        passed, detail = verify_foreign_keys(self.db_path)
    +        self.assertTrue(passed, detail)
    +
    +    def test_status_works_after_migration(self):
    +        """`status` must exit 0 when run after a successful `init`."""
    +        init_proc = run_archivebox(self.work_dir, ["init"], timeout=45)
    +        self.assertEqual(init_proc.returncode, 0, f"Init failed: {init_proc.stderr}")
    +
    +        status_proc = run_archivebox(self.work_dir, ["status"])
    +        self.assertEqual(status_proc.returncode, 0, f"Status failed after migration: {status_proc.stderr}")
    +
    +    def test_search_works_after_migration(self):
    +        """`search` must exit 0 after `init` and its output must cover every seeded snapshot."""
    +        init_proc = run_archivebox(self.work_dir, ["init"], timeout=45)
    +        self.assertEqual(init_proc.returncode, 0, f"Init failed: {init_proc.stderr}")
    +
    +        search_proc = run_archivebox(self.work_dir, ["search"])
    +        self.assertEqual(search_proc.returncode, 0, f"Search failed after migration: {search_proc.stderr}")
    +
    +        # stdout and stderr are concatenated before being handed to the helper
    +        combined = search_proc.stdout + search_proc.stderr
    +        passed, detail = verify_all_snapshots_in_output(combined, self.original_data["snapshots"])
    +        self.assertTrue(passed, detail)
    +
    +    def test_list_works_after_migration(self):
    +        """`snapshot list` must exit 0 after `init` and its output must cover every seeded snapshot."""
    +        init_proc = run_archivebox(self.work_dir, ["init"], timeout=45)
    +        self.assertEqual(init_proc.returncode, 0, f"Init failed: {init_proc.stderr}")
    +
    +        list_proc = run_archivebox(self.work_dir, ["snapshot", "list"])
    +        self.assertEqual(list_proc.returncode, 0, f"List failed after migration: {list_proc.stderr}")
    +
    +        # stdout and stderr are concatenated before being handed to the helper
    +        combined = list_proc.stdout + list_proc.stderr
    +        passed, detail = verify_all_snapshots_in_output(combined, self.original_data["snapshots"])
    +        self.assertTrue(passed, detail)
    +
    +    def test_new_schema_elements_created_after_migration(self):
    +        """After `init`, the crawls_crawl table must be listed in sqlite_master."""
    +        init_proc = run_archivebox(self.work_dir, ["init"], timeout=45)
    +        self.assertEqual(init_proc.returncode, 0, f"Init failed: {init_proc.stderr}")
    +
    +        db = sqlite3.connect(str(self.db_path))
    +        cur = db.cursor()
    +
    +        # Collect the names of all tables in the migrated database
    +        cur.execute("SELECT name FROM sqlite_master WHERE type='table'")
    +        table_names = {record[0] for record in cur.fetchall()}
    +        db.close()
    +
    +        # crawls_crawl is the table this test requires
    +        self.assertIn("crawls_crawl", table_names, "crawls_crawl table not created during migration")
    +
    +    def test_snapshots_have_new_fields_after_migration(self):
    +        """After `init`, core_snapshot must expose status, depth, created_at and modified_at columns."""
    +        init_proc = run_archivebox(self.work_dir, ["init"], timeout=45)
    +        self.assertEqual(init_proc.returncode, 0, f"Init failed: {init_proc.stderr}")
    +
    +        db = sqlite3.connect(str(self.db_path))
    +        cur = db.cursor()
    +
    +        # Index 1 of each table_info row is used as the column name
    +        cur.execute("PRAGMA table_info(core_snapshot)")
    +        column_names = {record[1] for record in cur.fetchall()}
    +        db.close()
    +
    +        # Every one of these columns must be present on core_snapshot
    +        expected_columns = {"status", "depth", "created_at", "modified_at"}
    +        for name in expected_columns:
    +            self.assertIn(name, column_names, f"Snapshot missing new column: {name}")
    +
    +    def test_add_works_after_migration(self):
    +        """`add --index-only` must succeed after `init` and leave at least one row in crawls_crawl."""
    +        init_proc = run_archivebox(self.work_dir, ["init"], timeout=45)
    +        self.assertEqual(init_proc.returncode, 0, f"Init failed: {init_proc.stderr}")
    +
    +        # Fail early, with the tail of init's stderr, if crawls_crawl is missing
    +        db = sqlite3.connect(str(self.db_path))
    +        cur = db.cursor()
    +        cur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='crawls_crawl'")
    +        has_crawl_table = cur.fetchone() is not None
    +        db.close()
    +        self.assertTrue(has_crawl_table, f"Init failed to create crawls_crawl table. Init stderr: {init_proc.stderr[-500:]}")
    +
    +        # Add one URL with --index-only (the original author chose this flag for speed)
    +        add_proc = run_archivebox(self.work_dir, ["add", "--index-only", "https://example.com/new-page"], timeout=45)
    +        self.assertEqual(add_proc.returncode, 0, f"Add failed after migration: {add_proc.stderr}")
    +
    +        # Count all rows in crawls_crawl after the add
    +        db = sqlite3.connect(str(self.db_path))
    +        cur = db.cursor()
    +        cur.execute("SELECT COUNT(*) FROM crawls_crawl")
    +        total_crawls = cur.fetchone()[0]
    +        db.close()
    +
    +        self.assertGreaterEqual(total_crawls, 1, f"No Crawl created when adding URL. Add stderr: {add_proc.stderr[-500:]}")
    +
    +    def test_archiveresult_status_preserved_after_migration(self):
    +        """After `init`, core_archiveresult must still contain succeeded, failed and skipped rows."""
    +        init_proc = run_archivebox(self.work_dir, ["init"], timeout=45)
    +        self.assertEqual(init_proc.returncode, 0, f"Init failed: {init_proc.stderr}")
    +
    +        db = sqlite3.connect(str(self.db_path))
    +        cur = db.cursor()
    +
    +        # Map each status value to its row count
    +        cur.execute("SELECT status, COUNT(*) FROM core_archiveresult GROUP BY status")
    +        rows_per_status = dict(cur.fetchall())
    +        db.close()
    +
    +        # The seed data is expected to include all three statuses
    +        # (not verifiable from this file; see seed_0_7_data)
    +        for wanted in ("succeeded", "failed", "skipped"):
    +            self.assertIn(wanted, rows_per_status, f"Should have {wanted} results")
    +
    +    def test_version_works_after_migration(self):
    +        """`version` must exit 0 after `init` and print something that looks like version info."""
    +        init_proc = run_archivebox(self.work_dir, ["init"], timeout=45)
    +        self.assertEqual(init_proc.returncode, 0, f"Init failed: {init_proc.stderr}")
    +
    +        version_proc = run_archivebox(self.work_dir, ["version"])
    +        self.assertEqual(version_proc.returncode, 0, f"Version failed after migration: {version_proc.stderr}")
    +
    +        # Accept either the product name or the word "version" (case-insensitive)
    +        combined = version_proc.stdout + version_proc.stderr
    +        self.assertTrue(
    +            "ArchiveBox" in combined or "version" in combined.lower(),
    +            f"Version output missing expected content: {combined[:500]}",
    +        )
    +
    +    def test_help_works_after_migration(self):
    +        """`help` must exit 0 after `init` and mention both the add and status commands."""
    +        init_proc = run_archivebox(self.work_dir, ["init"], timeout=45)
    +        self.assertEqual(init_proc.returncode, 0, f"Init failed: {init_proc.stderr}")
    +
    +        help_proc = run_archivebox(self.work_dir, ["help"])
    +        self.assertEqual(help_proc.returncode, 0, f"Help failed after migration: {help_proc.stderr}")
    +
    +        # Both command names must occur somewhere in the lower-cased output
    +        combined = help_proc.stdout + help_proc.stderr
    +        self.assertTrue(
    +            all(word in combined.lower() for word in ("add", "status")),
    +            f"Help output missing expected commands: {combined[:500]}",
    +        )
    +
    +
    +class TestMigrationDataIntegrity07x(unittest.TestCase):
    +    """Data integrity checks after `init` on a 0.7.x database; each test builds and removes its own data dir."""
    +
    +    def test_no_duplicate_snapshots_after_migration(self):
    +        """After `init`, no URL may occur more than once in core_snapshot."""
    +        scratch_dir = Path(tempfile.mkdtemp())
    +        db_file = scratch_dir / "index.sqlite3"
    +
    +        try:
    +            create_data_dir_structure(scratch_dir)
    +            db = sqlite3.connect(str(db_file))
    +            db.executescript(SCHEMA_0_7)
    +            db.close()
    +            seed_0_7_data(db_file)
    +
    +            init_proc = run_archivebox(scratch_dir, ["init"], timeout=45)
    +            self.assertEqual(init_proc.returncode, 0, f"Init failed: {init_proc.stderr}")
    +
    +            # Select every URL that is stored more than once
    +            db = sqlite3.connect(str(db_file))
    +            cur = db.cursor()
    +            cur.execute("""
    +                SELECT url, COUNT(*) as cnt FROM core_snapshot
    +                GROUP BY url HAVING cnt > 1
    +            """)
    +            repeated = cur.fetchall()
    +            db.close()
    +
    +            self.assertEqual(len(repeated), 0, f"Found duplicate URLs: {repeated}")
    +
    +        finally:
    +            shutil.rmtree(scratch_dir, ignore_errors=True)
    +
    +    def test_no_orphaned_archiveresults_after_migration(self):
    +        """After `init`, verify_foreign_keys must report success for the migrated database."""
    +        scratch_dir = Path(tempfile.mkdtemp())
    +        db_file = scratch_dir / "index.sqlite3"
    +
    +        try:
    +            create_data_dir_structure(scratch_dir)
    +            db = sqlite3.connect(str(db_file))
    +            db.executescript(SCHEMA_0_7)
    +            db.close()
    +            seed_0_7_data(db_file)
    +
    +            init_proc = run_archivebox(scratch_dir, ["init"], timeout=45)
    +            self.assertEqual(init_proc.returncode, 0, f"Init failed: {init_proc.stderr}")
    +
    +            passed, detail = verify_foreign_keys(db_file)
    +            self.assertTrue(passed, detail)
    +
    +        finally:
    +            shutil.rmtree(scratch_dir, ignore_errors=True)
    +
    +    def test_timestamps_preserved_after_migration(self):
    +        """After `init`, each seeded URL must keep the timestamp it was seeded with."""
    +        scratch_dir = Path(tempfile.mkdtemp())
    +        db_file = scratch_dir / "index.sqlite3"
    +
    +        try:
    +            create_data_dir_structure(scratch_dir)
    +            db = sqlite3.connect(str(db_file))
    +            db.executescript(SCHEMA_0_7)
    +            db.close()
    +            seeded = seed_0_7_data(db_file)
    +
    +            seeded_stamps = {entry["url"]: entry["timestamp"] for entry in seeded["snapshots"]}
    +
    +            init_proc = run_archivebox(scratch_dir, ["init"], timeout=45)
    +            self.assertEqual(init_proc.returncode, 0, f"Init failed: {init_proc.stderr}")
    +
    +            db = sqlite3.connect(str(db_file))
    +            cur = db.cursor()
    +            cur.execute("SELECT url, timestamp FROM core_snapshot")
    +            stamps_after = dict(cur.fetchall())
    +            db.close()
    +
    +            for url, seeded_ts in seeded_stamps.items():
    +                self.assertEqual(
    +                    stamps_after.get(url),
    +                    seeded_ts,
    +                    f"Timestamp changed for {url}: {seeded_ts} -> {stamps_after.get(url)}",
    +                )
    +
    +        finally:
    +            shutil.rmtree(scratch_dir, ignore_errors=True)
    +
    +    def test_tag_associations_preserved_after_migration(self):
    +        """The row count of core_snapshot_tags must be the same before and after `init`."""
    +        scratch_dir = Path(tempfile.mkdtemp())
    +        db_file = scratch_dir / "index.sqlite3"
    +
    +        try:
    +            create_data_dir_structure(scratch_dir)
    +            db = sqlite3.connect(str(db_file))
    +            db.executescript(SCHEMA_0_7)
    +            db.close()
    +            seed_0_7_data(db_file)
    +
    +            # Row count of the association table before `init`
    +            db = sqlite3.connect(str(db_file))
    +            cur = db.cursor()
    +            cur.execute("SELECT COUNT(*) FROM core_snapshot_tags")
    +            links_before = cur.fetchone()[0]
    +            db.close()
    +
    +            init_proc = run_archivebox(scratch_dir, ["init"], timeout=45)
    +            self.assertEqual(init_proc.returncode, 0, f"Init failed: {init_proc.stderr}")
    +
    +            # Row count of the association table after `init`
    +            db = sqlite3.connect(str(db_file))
    +            cur = db.cursor()
    +            cur.execute("SELECT COUNT(*) FROM core_snapshot_tags")
    +            links_after = cur.fetchone()[0]
    +            db.close()
    +
    +            self.assertEqual(
    +                links_after,
    +                links_before,
    +                f"Tag associations changed: {links_before} -> {links_after}",
    +            )
    +
    +        finally:
    +            shutil.rmtree(scratch_dir, ignore_errors=True)
    +
    +
    +if __name__ == "__main__":
    +    unittest.main()  # run this module's tests when it is executed directly
    diff --git a/archivebox/tests/test_migrations_08_to_09.py b/archivebox/tests/test_migrations_08_to_09.py
    new file mode 100644
    index 0000000000..7264a06d4e
    --- /dev/null
    +++ b/archivebox/tests/test_migrations_08_to_09.py
    @@ -0,0 +1,798 @@
    +#!/usr/bin/env python3
    +"""
    +Migration tests from 0.8.x to 0.9.x.
    +
    +0.8.x introduced:
    +- Crawl model for grouping URLs
    +- Seed model (removed in 0.9.x)
    +- UUID primary keys for Snapshot
    +- Status fields for state machine
    +- New fields like depth, retry_at, etc.
    +"""
    +
    +import shutil
    +import sqlite3
    +import tempfile
    +import unittest
    +import json
    +from pathlib import Path
    +
    +from .migrations_helpers import (
    +    SCHEMA_0_7,
    +    SCHEMA_0_8,
    +    seed_0_8_data,
    +    seed_0_7_data,
    +    run_archivebox,
    +    create_data_dir_structure,
    +    verify_snapshot_count,
    +    verify_snapshot_urls,
    +    verify_snapshot_titles,
    +    verify_tag_count,
    +    verify_archiveresult_count,
    +    verify_foreign_keys,
    +    verify_all_snapshots_in_output,
    +    verify_crawl_count,
    +    verify_process_migration,
    +)
    +
    +
    +class TestMigrationFrom08x(unittest.TestCase):
    +    """Test migration from 0.8.x schema to latest."""
    +
    +    def setUp(self):
    +        """Build a scratch data dir holding a 0.8.x-schema database filled by seed_0_8_data."""
    +        self.work_dir = Path(tempfile.mkdtemp())
    +        self.db_path = self.work_dir / "index.sqlite3"
    +
    +        # Lay out the data directory
    +        create_data_dir_structure(self.work_dir)
    +
    +        # Apply the 0.8.x schema script to a fresh SQLite file
    +        db = sqlite3.connect(str(self.db_path))
    +        db.executescript(SCHEMA_0_8)
    +        db.close()
    +
    +        # Seed rows; the return value is kept for comparison in the tests
    +        self.original_data = seed_0_8_data(self.db_path)
    +
    +    def tearDown(self):
    +        """Delete the per-test working directory; errors during removal are ignored."""
    +        shutil.rmtree(self.work_dir, ignore_errors=True)
    +
    +    def test_migration_preserves_snapshot_count(self):
    +        """Run `init` on the seeded 0.8.x data and check the snapshot count via verify_snapshot_count."""
    +        seeded_total = len(self.original_data["snapshots"])
    +
    +        init_proc = run_archivebox(self.work_dir, ["init"], timeout=45)
    +        self.assertEqual(init_proc.returncode, 0, f"Init failed: {init_proc.stderr}")
    +
    +        passed, detail = verify_snapshot_count(self.db_path, seeded_total)
    +        self.assertTrue(passed, detail)
    +
    +    def test_migration_preserves_snapshot_urls(self):
    +        """Run `init` on the seeded 0.8.x data and check the snapshot URLs via verify_snapshot_urls."""
    +        seeded_urls = [entry["url"] for entry in self.original_data["snapshots"]]
    +
    +        init_proc = run_archivebox(self.work_dir, ["init"], timeout=45)
    +        self.assertEqual(init_proc.returncode, 0, f"Init failed: {init_proc.stderr}")
    +
    +        passed, detail = verify_snapshot_urls(self.db_path, seeded_urls)
    +        self.assertTrue(passed, detail)
    +
    +    def test_migration_preserves_crawls(self):
    +        """After `init`, the crawl count must be the seeded crawls plus one if any snapshot had no crawl."""
    +        init_proc = run_archivebox(self.work_dir, ["init"], timeout=45)
    +        self.assertEqual(init_proc.returncode, 0, f"Init failed: {init_proc.stderr}")
    +
    +        # Does any seeded snapshot lack a crawl_id?
    +        has_orphans = any(entry["crawl_id"] is None for entry in self.original_data["snapshots"])
    +
    +        # Start from the seeded crawls; one extra crawl is expected when orphans exist
    +        crawl_total = len(self.original_data["crawls"])
    +        if has_orphans:
    +            crawl_total += 1  # original note: migration 0024 creates a default crawl
    +
    +        passed, detail = verify_crawl_count(self.db_path, crawl_total)
    +        self.assertTrue(passed, detail)
    +
    +    def test_migration_preserves_snapshot_crawl_links(self):
    +        """After `init`, seeded crawl links must be unchanged and snapshots seeded without one must have a crawl_id."""
    +        result = run_archivebox(self.work_dir, ["init"], timeout=45)
    +        self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
    +
    +        conn = sqlite3.connect(str(self.db_path))
    +        cursor = conn.cursor()
    +        try:  # a failing assertion raises; the finally clause still closes the connection
    +            # Check EVERY snapshot has a crawl_id after migration
    +            for snapshot in self.original_data["snapshots"]:
    +                cursor.execute("SELECT crawl_id FROM core_snapshot WHERE url = ?", (snapshot["url"],))
    +                row = cursor.fetchone()
    +                self.assertIsNotNone(row, f"Snapshot {snapshot['url']} not found after migration")
    +
    +                if snapshot["crawl_id"] is not None:
    +                    # Snapshots that had a crawl should keep it
    +                    self.assertEqual(
    +                        row[0],
    +                        snapshot["crawl_id"],
    +                        f"Crawl ID changed for {snapshot['url']}: expected {snapshot['crawl_id']}, got {row[0]}",
    +                    )
    +                else:
    +                    # Snapshots without a crawl should now have one (the default crawl)
    +                    self.assertIsNotNone(
    +                        row[0],
    +                        f"Snapshot {snapshot['url']} should have been assigned to default crawl but has NULL",
    +                    )
    +        finally:
    +            conn.close()
    +
    +    def test_migration_preserves_tags(self):
    +        """After `init`, verify_tag_count must accept the seeded tag total."""
    +        init_proc = run_archivebox(self.work_dir, ["init"], timeout=45)
    +        self.assertEqual(init_proc.returncode, 0, f"Init failed: {init_proc.stderr}")
    +
    +        passed, detail = verify_tag_count(self.db_path, len(self.original_data["tags"]))
    +        self.assertTrue(passed, detail)
    +
    +    def test_migration_preserves_archiveresults(self):
    +        """After `init`, verify_archiveresult_count must accept the seeded archive result total."""
    +        seeded_total = len(self.original_data["archiveresults"])
    +
    +        init_proc = run_archivebox(self.work_dir, ["init"], timeout=45)
    +        self.assertEqual(init_proc.returncode, 0, f"Init failed: {init_proc.stderr}")
    +
    +        passed, detail = verify_archiveresult_count(self.db_path, seeded_total)
    +        self.assertTrue(passed, detail)
    +
    +    def test_migration_preserves_archiveresult_status(self):
    +        """After `init`, core_archiveresult must still contain succeeded, failed and skipped rows."""
    +        init_proc = run_archivebox(self.work_dir, ["init"], timeout=45)
    +        self.assertEqual(init_proc.returncode, 0, f"Init failed: {init_proc.stderr}")
    +
    +        db = sqlite3.connect(str(self.db_path))
    +        cur = db.cursor()
    +
    +        # Map each status value to its row count
    +        cur.execute("SELECT status, COUNT(*) FROM core_archiveresult GROUP BY status")
    +        rows_per_status = dict(cur.fetchall())
    +        db.close()
    +
    +        # The seed data is expected to include all three statuses
    +        # (not verifiable from this file; see seed_0_8_data)
    +        for wanted in ("succeeded", "failed", "skipped"):
    +            self.assertIn(wanted, rows_per_status, f"Should have {wanted} results")
    +
    +    def test_status_works_after_migration(self):
    +        """`status` must exit 0 when run after a successful `init`."""
    +        init_proc = run_archivebox(self.work_dir, ["init"], timeout=45)
    +        self.assertEqual(init_proc.returncode, 0, f"Init failed: {init_proc.stderr}")
    +
    +        status_proc = run_archivebox(self.work_dir, ["status"])
    +        self.assertEqual(status_proc.returncode, 0, f"Status failed after migration: {status_proc.stderr}")
    +
    +    def test_list_works_after_migration(self):
    +        """`snapshot list` must exit 0 after `init` and its output must cover every seeded snapshot."""
    +        init_proc = run_archivebox(self.work_dir, ["init"], timeout=45)
    +        self.assertEqual(init_proc.returncode, 0, f"Init failed: {init_proc.stderr}")
    +
    +        list_proc = run_archivebox(self.work_dir, ["snapshot", "list"])
    +        self.assertEqual(list_proc.returncode, 0, f"List failed after migration: {list_proc.stderr}")
    +
    +        # stdout and stderr are concatenated before being handed to the helper
    +        combined = list_proc.stdout + list_proc.stderr
    +        passed, detail = verify_all_snapshots_in_output(combined, self.original_data["snapshots"])
    +        self.assertTrue(passed, detail)
    +
    +    def test_search_works_after_migration(self):
    +        """`search` must exit 0 after `init` and its output must cover every seeded snapshot."""
    +        init_proc = run_archivebox(self.work_dir, ["init"], timeout=45)
    +        self.assertEqual(init_proc.returncode, 0, f"Init failed: {init_proc.stderr}")
    +
    +        search_proc = run_archivebox(self.work_dir, ["search"])
    +        self.assertEqual(search_proc.returncode, 0, f"Search failed after migration: {search_proc.stderr}")
    +
    +        # stdout and stderr are concatenated before being handed to the helper
    +        combined = search_proc.stdout + search_proc.stderr
    +        passed, detail = verify_all_snapshots_in_output(combined, self.original_data["snapshots"])
    +        self.assertTrue(passed, detail)
    +
    +    def test_migration_preserves_snapshot_titles(self):
    +        """After `init`, verify_snapshot_titles must accept the seeded url -> title mapping."""
    +        seeded_titles = {entry["url"]: entry["title"] for entry in self.original_data["snapshots"]}
    +
    +        init_proc = run_archivebox(self.work_dir, ["init"], timeout=45)
    +        self.assertEqual(init_proc.returncode, 0, f"Init failed: {init_proc.stderr}")
    +
    +        passed, detail = verify_snapshot_titles(self.db_path, seeded_titles)
    +        self.assertTrue(passed, detail)
    +
    +    def test_migration_preserves_foreign_keys(self):
    +        """Migration should maintain foreign key relationships."""
    +        result = run_archivebox(self.work_dir, ["init"], timeout=45)
    +        self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
    +
    +        ok, msg = verify_foreign_keys(self.db_path)
    +        self.assertTrue(ok, msg)
    +
    +    def test_migration_removes_seed_id_column(self):
    +        """Migration should remove seed_id column from archivebox.crawls.crawl."""
    +        result = run_archivebox(self.work_dir, ["init"], timeout=45)
    +        self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
    +
    +        conn = sqlite3.connect(str(self.db_path))
    +        cursor = conn.cursor()
    +        cursor.execute("PRAGMA table_info(crawls_crawl)")
    +        columns = [row[1] for row in cursor.fetchall()]
    +        conn.close()
    +
    +        self.assertNotIn(
    +            "seed_id",
    +            columns,
    +            f"seed_id column should have been removed by migration. Columns: {columns}",
    +        )
    +
    +    def test_migration_removes_seed_table(self):
    +        """Migration should remove crawls_seed table."""
    +        result = run_archivebox(self.work_dir, ["init"], timeout=45)
    +        self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
    +
    +        conn = sqlite3.connect(str(self.db_path))
    +        cursor = conn.cursor()
    +        cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='crawls_seed'")
    +        table_exists = cursor.fetchone() is not None
    +        conn.close()
    +
    +        self.assertFalse(table_exists, "crawls_seed table should have been removed by migration")
    +
    +    def test_add_works_after_migration(self):
    +        """Adding new URLs should work after migration from 0.8.x."""
    +        result = run_archivebox(self.work_dir, ["init"], timeout=45)
    +        # Check that init actually ran and applied migrations
    +        self.assertIn(
    +            "Applying",
    +            result.stdout + result.stderr,
    +            f"Init did not apply migrations. stdout: {result.stdout[:500]}, stderr: {result.stderr[:500]}",
    +        )
    +        self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
    +
    +        # Count existing crawls
    +        conn = sqlite3.connect(str(self.db_path))
    +        cursor = conn.cursor()
    +        cursor.execute("SELECT COUNT(*) FROM crawls_crawl")
    +        initial_crawl_count = cursor.fetchone()[0]
    +        conn.close()
    +
    +        # Try to add a new URL after migration (use --index-only for speed)
    +        result = run_archivebox(self.work_dir, ["add", "--index-only", "https://example.com/new-page"], timeout=45)
    +        self.assertEqual(result.returncode, 0, f"Add failed after migration: {result.stderr}")
    +
    +        # Verify a new Crawl was created
    +        conn = sqlite3.connect(str(self.db_path))
    +        cursor = conn.cursor()
    +        cursor.execute("SELECT COUNT(*) FROM crawls_crawl")
    +        new_crawl_count = cursor.fetchone()[0]
    +        conn.close()
    +
    +        self.assertGreater(
    +            new_crawl_count,
    +            initial_crawl_count,
    +            f"No new Crawl created when adding URL. Add stderr: {result.stderr[-500:]}",
    +        )
    +
    +    def test_version_works_after_migration(self):
    +        """Version command should work after migration."""
    +        result = run_archivebox(self.work_dir, ["init"], timeout=45)
    +        self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
    +
    +        result = run_archivebox(self.work_dir, ["version"])
    +        self.assertEqual(result.returncode, 0, f"Version failed after migration: {result.stderr}")
    +
    +        # Should show version info
    +        output = result.stdout + result.stderr
    +        self.assertTrue(
    +            "ArchiveBox" in output or "version" in output.lower(),
    +            f"Version output missing expected content: {output[:500]}",
    +        )
    +
    +    def test_migration_creates_process_records(self):
    +        """Migration should create Process records for all ArchiveResults."""
    +        result = run_archivebox(self.work_dir, ["init"], timeout=45)
    +        self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
    +
    +        # Verify Process records created
    +        expected_count = len(self.original_data["archiveresults"])
    +        ok, msg = verify_process_migration(self.db_path, expected_count)
    +        self.assertTrue(ok, msg)
    +
    +    def test_migration_creates_binary_records(self):
    +        """Migration should create Binary records from cmd_version data."""
    +        result = run_archivebox(self.work_dir, ["init"], timeout=45)
    +        self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
    +
    +        conn = sqlite3.connect(str(self.db_path))
    +        cursor = conn.cursor()
    +
    +        # Check Binary records exist
    +        cursor.execute("SELECT COUNT(*) FROM machine_binary")
    +        binary_count = cursor.fetchone()[0]
    +
    +        # Should have at least one binary per unique extractor
    +        extractors = {ar["extractor"] for ar in self.original_data["archiveresults"]}
    +        self.assertGreaterEqual(
    +            binary_count,
    +            len(extractors),
    +            f"Expected at least {len(extractors)} Binaries, got {binary_count}",
    +        )
    +
    +        conn.close()
    +
    +    def test_migration_preserves_cmd_data(self):
    +        """Migration should preserve cmd data in Process.cmd field."""
    +        result = run_archivebox(self.work_dir, ["init"], timeout=45)
    +        self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
    +
    +        conn = sqlite3.connect(str(self.db_path))
    +        cursor = conn.cursor()
    +
    +        # Check that Process records have cmd arrays
    +        cursor.execute("SELECT cmd FROM machine_process WHERE cmd != '[]'")
    +        cmd_records = cursor.fetchall()
    +
    +        # All Processes should have non-empty cmd (test data has json.dumps([extractor, '--version']))
    +        expected_count = len(self.original_data["archiveresults"])
    +        self.assertEqual(
    +            len(cmd_records),
    +            expected_count,
    +            f"Expected {expected_count} Processes with cmd, got {len(cmd_records)}",
    +        )
    +
    +        conn.close()
    +
    +
    +class TestMigrationDataIntegrity08x(unittest.TestCase):
    +    """Comprehensive data integrity tests for 0.8.x migrations."""
    +
    +    def test_no_duplicate_snapshots_after_migration(self):
    +        """Migration should not create duplicate snapshots."""
    +        work_dir = Path(tempfile.mkdtemp())
    +        db_path = work_dir / "index.sqlite3"
    +
    +        try:
    +            create_data_dir_structure(work_dir)
    +            conn = sqlite3.connect(str(db_path))
    +            conn.executescript(SCHEMA_0_8)
    +            conn.close()
    +            seed_0_8_data(db_path)
    +
    +            result = run_archivebox(work_dir, ["init"], timeout=45)
    +            self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
    +
    +            # Check for duplicate URLs
    +            conn = sqlite3.connect(str(db_path))
    +            cursor = conn.cursor()
    +            cursor.execute("""
    +                SELECT url, COUNT(*) as cnt FROM core_snapshot
    +                GROUP BY url HAVING cnt > 1
    +            """)
    +            duplicates = cursor.fetchall()
    +            conn.close()
    +
    +            self.assertEqual(len(duplicates), 0, f"Found duplicate URLs: {duplicates}")
    +
    +        finally:
    +            shutil.rmtree(work_dir, ignore_errors=True)
    +
    +    def test_no_orphaned_archiveresults_after_migration(self):
    +        """Migration should not leave orphaned ArchiveResults."""
    +        work_dir = Path(tempfile.mkdtemp())
    +        db_path = work_dir / "index.sqlite3"
    +
    +        try:
    +            create_data_dir_structure(work_dir)
    +            conn = sqlite3.connect(str(db_path))
    +            conn.executescript(SCHEMA_0_8)
    +            conn.close()
    +            seed_0_8_data(db_path)
    +
    +            result = run_archivebox(work_dir, ["init"], timeout=45)
    +            self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
    +
    +            ok, msg = verify_foreign_keys(db_path)
    +            self.assertTrue(ok, msg)
    +
    +        finally:
    +            shutil.rmtree(work_dir, ignore_errors=True)
    +
    +    def test_timestamps_preserved_after_migration(self):
    +        """Migration should preserve original timestamps."""
    +        work_dir = Path(tempfile.mkdtemp())
    +        db_path = work_dir / "index.sqlite3"
    +
    +        try:
    +            create_data_dir_structure(work_dir)
    +            conn = sqlite3.connect(str(db_path))
    +            conn.executescript(SCHEMA_0_8)
    +            conn.close()
    +            original_data = seed_0_8_data(db_path)
    +
    +            original_timestamps = {s["url"]: s["timestamp"] for s in original_data["snapshots"]}
    +
    +            result = run_archivebox(work_dir, ["init"], timeout=45)
    +            self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
    +
    +            conn = sqlite3.connect(str(db_path))
    +            cursor = conn.cursor()
    +            cursor.execute("SELECT url, timestamp FROM core_snapshot")
    +            migrated_timestamps = {row[0]: row[1] for row in cursor.fetchall()}
    +            conn.close()
    +
    +            for url, original_ts in original_timestamps.items():
    +                self.assertEqual(
    +                    migrated_timestamps.get(url),
    +                    original_ts,
    +                    f"Timestamp changed for {url}: {original_ts} -> {migrated_timestamps.get(url)}",
    +                )
    +
    +        finally:
    +            shutil.rmtree(work_dir, ignore_errors=True)
    +
    +    def test_crawl_data_preserved_after_migration(self):
    +        """Migration should preserve crawl metadata (urls, label, status)."""
    +        work_dir = Path(tempfile.mkdtemp())
    +        db_path = work_dir / "index.sqlite3"
    +
    +        try:
    +            create_data_dir_structure(work_dir)
    +            conn = sqlite3.connect(str(db_path))
    +            conn.executescript(SCHEMA_0_8)
    +            conn.close()
    +            original_data = seed_0_8_data(db_path)
    +
    +            result = run_archivebox(work_dir, ["init"], timeout=45)
    +            self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
    +
    +            conn = sqlite3.connect(str(db_path))
    +            cursor = conn.cursor()
    +
    +            # Check each crawl's data is preserved
    +            for crawl in original_data["crawls"]:
    +                cursor.execute("SELECT urls, label FROM crawls_crawl WHERE id = ?", (crawl["id"],))
    +                row = cursor.fetchone()
    +                self.assertIsNotNone(row, f"Crawl {crawl['id']} not found after migration")
    +                self.assertEqual(row[0], crawl["urls"], f"URLs mismatch for crawl {crawl['id']}")
    +                self.assertEqual(row[1], crawl["label"], f"Label mismatch for crawl {crawl['id']}")
    +
    +            conn.close()
    +
    +        finally:
    +            shutil.rmtree(work_dir, ignore_errors=True)
    +
    +    def test_tag_associations_preserved_after_migration(self):
    +        """Migration should preserve snapshot-tag associations."""
    +        work_dir = Path(tempfile.mkdtemp())
    +        db_path = work_dir / "index.sqlite3"
    +
    +        try:
    +            create_data_dir_structure(work_dir)
    +            conn = sqlite3.connect(str(db_path))
    +            conn.executescript(SCHEMA_0_8)
    +            conn.close()
    +            seed_0_8_data(db_path)
    +
    +            # Count tag associations before migration
    +            conn = sqlite3.connect(str(db_path))
    +            cursor = conn.cursor()
    +            cursor.execute("SELECT COUNT(*) FROM core_snapshot_tags")
    +            original_count = cursor.fetchone()[0]
    +            conn.close()
    +
    +            result = run_archivebox(work_dir, ["init"], timeout=45)
    +            self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
    +
    +            # Count tag associations after migration
    +            conn = sqlite3.connect(str(db_path))
    +            cursor = conn.cursor()
    +            cursor.execute("SELECT COUNT(*) FROM core_snapshot_tags")
    +            migrated_count = cursor.fetchone()[0]
    +            conn.close()
    +
    +            self.assertEqual(
    +                migrated_count,
    +                original_count,
    +                f"Tag associations changed: {original_count} -> {migrated_count}",
    +            )
    +
    +        finally:
    +            shutil.rmtree(work_dir, ignore_errors=True)
    +
    +
    +class TestFilesystemMigration08to09(unittest.TestCase):
    +    """Test filesystem migration from 0.8.x flat structure to 0.9.x organized structure."""
    +
    +    def setUp(self):
    +        """Create a temporary directory for testing."""
    +        self.work_dir = Path(tempfile.mkdtemp())
    +        self.db_path = self.work_dir / "index.sqlite3"
    +
    +    def tearDown(self):
    +        """Clean up temporary directory."""
    +        shutil.rmtree(self.work_dir, ignore_errors=True)
    +
    +    def test_archiveresult_files_preserved_after_migration(self):
    +        """
    +        Test that ArchiveResult output files are reorganized into new structure.
    +
    +        This test verifies that:
    +        1. Migration preserves ArchiveResult data in Process/Binary records
    +        2. Running `archivebox update` reorganizes files into new structure
    +        3. New structure: users/username/snapshots/YYYYMMDD/example.com/snap-uuid-here/output.ext
    +        4. All files are moved (no data loss)
    +        5. Old archive/timestamp/ directories are cleaned up
    +        """
    +        create_data_dir_structure(self.work_dir)
    +        conn = sqlite3.connect(str(self.db_path))
    +        conn.executescript(SCHEMA_0_7)
    +        conn.close()
    +        original_data = seed_0_7_data(self.db_path)
    +        conn = sqlite3.connect(str(self.db_path))
    +        cursor = conn.cursor()
    +        for i, snapshot in enumerate(original_data["snapshots"]):
    +            legacy_timestamp = str(1704110400 + (i * 86400))
    +            cursor.execute(
    +                "UPDATE core_snapshot SET timestamp = ? WHERE id = ?",
    +                (legacy_timestamp, snapshot["id"]),
    +            )
    +            cursor.execute(
    +                "UPDATE core_archiveresult SET pwd = ? WHERE snapshot_id = ?",
    +                (f"/data/archive/{legacy_timestamp}", snapshot["id"]),
    +            )
    +            snapshot["timestamp"] = legacy_timestamp
    +        conn.commit()
    +        conn.close()
    +
    +        sample_files = [
    +            "favicon.ico",
    +            "screenshot.png",
    +            "singlefile.html",
    +            "headers.json",
    +        ]
    +        for snapshot in original_data["snapshots"]:
    +            snapshot_dir = self.work_dir / "archive" / snapshot["timestamp"]
    +            snapshot_dir.mkdir(parents=True, exist_ok=True)
    +            (snapshot_dir / "index.json").write_text(
    +                json.dumps(
    +                    {
    +                        "url": snapshot["url"],
    +                        "timestamp": snapshot["timestamp"],
    +                        "title": snapshot["title"],
    +                    },
    +                ),
    +            )
    +            for sample_file in sample_files:
    +                (snapshot_dir / sample_file).write_text(f"{snapshot['url']}::{sample_file}")
    +
    +        # Count archive directories and files BEFORE migration
    +        archive_dir = self.work_dir / "archive"
    +        dirs_before = list(archive_dir.glob("*")) if archive_dir.exists() else []
    +        dirs_before_count = len([d for d in dirs_before if d.is_dir()])
    +
    +        # Count total files in all archive directories
    +        files_before = []
    +        for d in dirs_before:
    +            if d.is_dir():
    +                files_before.extend([f for f in d.rglob("*") if f.is_file()])
    +        files_before_count = len(files_before)
    +
    +        # Sample some specific files to check they're preserved
    +        sample_paths_before = {}
    +        for d in dirs_before:
    +            if d.is_dir():
    +                for sample_file in sample_files:
    +                    matching = list(d.glob(sample_file))
    +                    if matching:
    +                        sample_paths_before[f"{d.name}/{sample_file}"] = matching[0]
    +
    +        print(f"\n[*] Archive directories before migration: {dirs_before_count}")
    +        print(f"[*] Total files before migration: {files_before_count}")
    +        print(f"[*] Sample files found: {len(sample_paths_before)}")
    +
    +        # Run init to trigger migration
    +        result = run_archivebox(self.work_dir, ["init"], timeout=60)
    +        self.assertEqual(result.returncode, 0, f"Init (migration) failed: {result.stderr}")
    +
    +        # Count archive directories and files AFTER migration
    +        dirs_after = list(archive_dir.glob("*")) if archive_dir.exists() else []
    +        dirs_after_count = len([d for d in dirs_after if d.is_dir()])
    +
    +        files_after = []
    +        for d in dirs_after:
    +            if d.is_dir():
    +                files_after.extend([f for f in d.rglob("*") if f.is_file()])
    +        files_after_count = len(files_after)
    +
    +        # Verify sample files still exist
    +        sample_paths_after = {}
    +        for d in dirs_after:
    +            if d.is_dir():
    +                for sample_file in sample_files:
    +                    matching = list(d.glob(sample_file))
    +                    if matching:
    +                        sample_paths_after[f"{d.name}/{sample_file}"] = matching[0]
    +
    +        print(f"[*] Archive directories after migration: {dirs_after_count}")
    +        print(f"[*] Total files after migration: {files_after_count}")
    +        print(f"[*] Sample files found: {len(sample_paths_after)}")
    +
    +        # Verify files still in old structure after migration (not moved yet)
    +        self.assertEqual(
    +            dirs_before_count,
    +            dirs_after_count,
    +            f"Archive directories lost during migration: {dirs_before_count} -> {dirs_after_count}",
    +        )
    +        self.assertEqual(
    +            files_before_count,
    +            files_after_count,
    +            f"Files lost during migration: {files_before_count} -> {files_after_count}",
    +        )
    +
    +        # Run update to trigger filesystem reorganization
    +        print("\n[*] Running archivebox update to reorganize filesystem...")
    +        result = run_archivebox(self.work_dir, ["update"], timeout=120)
    +        self.assertEqual(result.returncode, 0, f"Update failed: {result.stderr}")
    +
    +        # Check new filesystem structure
    +        # New structure: users/username/snapshots/YYYYMMDD/example.com/snap-uuid-here/output.ext
    +        users_dir = self.work_dir / "users"
    +        snapshots_base = None
    +
    +        if users_dir.exists():
    +            # Find the snapshots directory
    +            for user_dir in users_dir.iterdir():
    +                if user_dir.is_dir():
    +                    user_snapshots = user_dir / "snapshots"
    +                    if user_snapshots.exists():
    +                        snapshots_base = user_snapshots
    +                        break
    +
    +        print(f"[*] New structure base: {snapshots_base}")
    +
    +        # Count files in new structure
    +        # Structure: users/{username}/snapshots/YYYYMMDD/{domain}/{uuid}/files...
    +        files_new_structure = []
    +        new_sample_files = {}
    +
    +        if snapshots_base and snapshots_base.exists():
    +            for date_dir in snapshots_base.iterdir():
    +                if date_dir.is_dir():
    +                    for domain_dir in date_dir.iterdir():
    +                        if domain_dir.is_dir():
    +                            for snap_dir in domain_dir.iterdir():
    +                                if snap_dir.is_dir():
    +                                    # Files are directly in snap-uuid/ directory (no plugin subdirs)
    +                                    for f in snap_dir.rglob("*"):
    +                                        if f.is_file():
    +                                            files_new_structure.append(f)
    +                                            # Track sample files
    +                                            if f.name in sample_files:
    +                                                new_sample_files[f"{snap_dir.name}/{f.name}"] = f
    +
    +        files_new_count = len(files_new_structure)
    +        print(f"[*] Files in new structure: {files_new_count}")
    +        print(f"[*] Sample files in new structure: {len(new_sample_files)}")
    +
    +        # Check old structure (should be gone or empty)
    +        old_archive_dir = self.work_dir / "archive"
    +        old_files_remaining = []
    +        unmigrated_dirs = []
    +        if old_archive_dir.exists():
    +            for d in old_archive_dir.glob("*"):
    +                # Only count REAL directories, not symlinks (symlinks are the migrated ones)
    +                if d.is_dir(follow_symlinks=False) and d.name.replace(".", "").isdigit():
    +                    # This is a timestamp directory (old structure)
    +                    files_in_dir = [f for f in d.rglob("*") if f.is_file()]
    +                    if files_in_dir:
    +                        unmigrated_dirs.append((d.name, len(files_in_dir)))
    +                        old_files_remaining.extend(files_in_dir)
    +
    +        old_files_count = len(old_files_remaining)
    +        print(f"[*] Files remaining in old structure: {old_files_count}")
    +        if unmigrated_dirs:
    +            print(f"[*] Unmigrated directories: {unmigrated_dirs}")
    +
    +        # CRITICAL: Verify files were moved to new structure
    +        self.assertGreater(
    +            files_new_count,
    +            0,
    +            "No files found in new structure after update",
    +        )
    +
    +        # CRITICAL: Verify old structure is cleaned up
    +        self.assertEqual(
    +            old_files_count,
    +            0,
    +            f"Old structure not cleaned up: {old_files_count} files still in archive/timestamp/ directories",
    +        )
    +
    +        # CRITICAL: Verify all files were moved (total count should match)
    +        total_after_update = files_new_count + old_files_count
    +        self.assertEqual(
    +            files_before_count,
    +            total_after_update,
    +            f"Files lost during reorganization: {files_before_count} before → {total_after_update} after",
    +        )
    +
    +        # CRITICAL: Verify sample files exist in new structure
    +        self.assertGreater(
    +            len(new_sample_files),
    +            0,
    +            "Sample files not found in new structure",
    +        )
    +
    +        # Verify new path format
    +        for path_key, file_path in new_sample_files.items():
    +            # Path should contain: snapshots/YYYYMMDD/domain/snap-uuid/plugin/file
    +            path_parts = file_path.parts
    +            self.assertIn(
    +                "snapshots",
    +                path_parts,
    +                f"New path should contain 'snapshots': {file_path}",
    +            )
    +            self.assertIn(
    +                "users",
    +                path_parts,
    +                f"New path should contain 'users': {file_path}",
    +            )
    +            print(f"    ✓ {path_key} → {file_path.relative_to(self.work_dir)}")
    +
    +        # Verify Process and Binary records were created
    +        conn = sqlite3.connect(str(self.db_path))
    +        cursor = conn.cursor()
    +
    +        cursor.execute("SELECT COUNT(*) FROM core_archiveresult")
    +        archiveresult_count = cursor.fetchone()[0]
    +
    +        cursor.execute("SELECT COUNT(*) FROM machine_process")
    +        process_count = cursor.fetchone()[0]
    +
    +        cursor.execute("SELECT COUNT(*) FROM machine_binary")
    +        binary_count = cursor.fetchone()[0]
    +
    +        cursor.execute("SELECT COUNT(*) FROM core_archiveresult WHERE process_id IS NOT NULL")
    +        linked_count = cursor.fetchone()[0]
    +
    +        conn.close()
    +
    +        print(f"[*] ArchiveResults: {archiveresult_count}")
    +        print(f"[*] Process records created: {process_count}")
    +        print(f"[*] Binary records created: {binary_count}")
    +        print(f"[*] ArchiveResults linked to Process: {linked_count}")
    +
    +        # Verify data migration happened correctly
    +        self.assertEqual(
    +            archiveresult_count,
    +            len(original_data["archiveresults"]),
    +            f"Expected {len(original_data['archiveresults'])} ArchiveResults after migration, got {archiveresult_count}",
    +        )
    +
    +        # Each ArchiveResult should create one Process record
    +        self.assertEqual(
    +            process_count,
    +            len(original_data["archiveresults"]),
    +            f"Expected {len(original_data['archiveresults'])} Process records (1 per ArchiveResult), got {process_count}",
    +        )
    +
    +        self.assertEqual(
    +            binary_count,
    +            5,
    +            f"Expected 5 unique Binary records, got {binary_count}",
    +        )
    +
    +        # ALL ArchiveResults should be linked to Process records
    +        self.assertEqual(
    +            linked_count,
    +            len(original_data["archiveresults"]),
    +            f"Expected all {len(original_data['archiveresults'])} ArchiveResults linked to Process, got {linked_count}",
    +        )
    +
    +
    +if __name__ == "__main__":
    +    unittest.main()  # run this module's tests when it is executed directly
    diff --git a/archivebox/tests/test_migrations_fresh.py b/archivebox/tests/test_migrations_fresh.py
    new file mode 100644
    index 0000000000..8c1eed4d40
    --- /dev/null
    +++ b/archivebox/tests/test_migrations_fresh.py
    @@ -0,0 +1,295 @@
    +#!/usr/bin/env python3
    +"""
    +Fresh install tests for ArchiveBox.
    +
    +Tests that fresh installations work correctly with the current schema.
    +"""
    +
    +import shutil
    +import sqlite3
    +import tempfile
    +import unittest
    +from pathlib import Path
    +
    +from .migrations_helpers import run_archivebox
    +
    +
    +class TestFreshInstall(unittest.TestCase):
    +    """Test that fresh installs work correctly."""
    +
    +    def test_init_creates_database(self):
    +        """Fresh init should create database and directories."""
    +        work_dir = Path(tempfile.mkdtemp())
    +
    +        try:
    +            result = run_archivebox(work_dir, ["init"])
    +            self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
    +
    +            # Verify database was created
    +            self.assertTrue((work_dir / "index.sqlite3").exists(), "Database not created")
    +            # Verify archive directory exists
    +            self.assertTrue((work_dir / "archive").is_dir(), "Archive dir not created")
    +
    +        finally:
    +            shutil.rmtree(work_dir, ignore_errors=True)
    +
    +    def test_status_after_init(self):
    +        """Status command should work after init."""
    +        work_dir = Path(tempfile.mkdtemp())
    +
    +        try:
    +            result = run_archivebox(work_dir, ["init"])
    +            self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
    +
    +            result = run_archivebox(work_dir, ["status"])
    +            self.assertEqual(result.returncode, 0, f"Status failed: {result.stderr}")
    +
    +        finally:
    +            shutil.rmtree(work_dir, ignore_errors=True)
    +
    +    def test_add_url_after_init(self):
    +        """Should be able to add URLs after init with --index-only."""
    +        work_dir = Path(tempfile.mkdtemp())
    +
    +        try:
    +            result = run_archivebox(work_dir, ["init"])
    +            self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
    +
    +            # Add a URL with --index-only for speed
    +            result = run_archivebox(work_dir, ["add", "--index-only", "https://example.com"])
    +            self.assertEqual(result.returncode, 0, f"Add command failed: {result.stderr}")
    +
    +            conn = sqlite3.connect(str(work_dir / "index.sqlite3"))
    +            cursor = conn.cursor()
    +
    +            # Verify a Crawl was created
    +            cursor.execute("SELECT COUNT(*) FROM crawls_crawl")
    +            crawl_count = cursor.fetchone()[0]
    +            self.assertGreaterEqual(crawl_count, 1, "No Crawl was created")
    +
    +            # Verify at least one snapshot was created
    +            cursor.execute("SELECT COUNT(*) FROM core_snapshot")
    +            snapshot_count = cursor.fetchone()[0]
    +            self.assertGreaterEqual(snapshot_count, 1, "No Snapshot was created")
    +
    +            conn.close()
    +
    +        finally:
    +            shutil.rmtree(work_dir, ignore_errors=True)
    +
    +    def test_list_after_add(self):
    +        """List command should show added snapshots."""
    +        work_dir = Path(tempfile.mkdtemp())
    +
    +        try:
    +            result = run_archivebox(work_dir, ["init"])
    +            self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
    +
    +            result = run_archivebox(work_dir, ["add", "--index-only", "https://example.com"])
    +            self.assertEqual(result.returncode, 0, f"Add failed: {result.stderr}")
    +
    +            result = run_archivebox(work_dir, ["list"])
    +            self.assertEqual(result.returncode, 0, f"List failed: {result.stderr}")
    +
    +            # Verify the URL appears in output
    +            output = result.stdout + result.stderr
    +            self.assertIn("example.com", output, f"Added URL not in list output: {output[:500]}")
    +
    +        finally:
    +            shutil.rmtree(work_dir, ignore_errors=True)
    +
    +    def test_migrations_table_populated(self):
    +        """A fresh `init` should leave more than 10 rows in django_migrations."""
    +        archive_dir = Path(tempfile.mkdtemp())
    +
    +        try:
    +            init_proc = run_archivebox(archive_dir, ["init"])
    +            self.assertEqual(init_proc.returncode, 0, f"Init failed: {init_proc.stderr}")
    +
    +            db = sqlite3.connect(str(archive_dir / "index.sqlite3"))
    +            cur = db.cursor()
    +            cur.execute("SELECT COUNT(*) FROM django_migrations")
    +            (count,) = cur.fetchone()
    +            db.close()
    +
    +            # The connection is already closed; only the saved count is checked.
    +            self.assertGreater(count, 10, f"Expected >10 migrations, got {count}")
    +
    +        finally:
    +            shutil.rmtree(archive_dir, ignore_errors=True)
    +
    +    def test_core_migrations_applied(self):
    +        """The `core` app's 0001_initial migration should be recorded after init."""
    +        archive_dir = Path(tempfile.mkdtemp())
    +
    +        try:
    +            init_proc = run_archivebox(archive_dir, ["init"])
    +            self.assertEqual(init_proc.returncode, 0, f"Init failed: {init_proc.stderr}")
    +
    +            db = sqlite3.connect(str(archive_dir / "index.sqlite3"))
    +            cur = db.cursor()
    +            cur.execute("SELECT name FROM django_migrations WHERE app='core' ORDER BY name")
    +            applied = [name for (name,) in cur.fetchall()]
    +            db.close()
    +
    +            self.assertIn("0001_initial", applied)
    +
    +        finally:
    +            shutil.rmtree(archive_dir, ignore_errors=True)
    +
    +
    +class TestSchemaIntegrity(unittest.TestCase):
    +    """Test that the database schema is correct.
    +
    +    Each test runs `archivebox init` in a fresh temporary directory and reads
    +    the table layout from index.sqlite3 via `PRAGMA table_info`. The shared
    +    setup/query/cleanup steps live in the private helpers below.
    +    """
    +
    +    def _columns_after_init(self, table):
    +        """Run `archivebox init` in a temp dir and return the column names of *table*.
    +
    +        Args:
    +            table: SQLite table name. It is interpolated directly into the
    +                PRAGMA statement, so only pass trusted, hard-coded names.
    +
    +        Returns:
    +            Set of column names reported by `PRAGMA table_info(<table>)`;
    +            empty if the table does not exist.
    +
    +        Raises:
    +            AssertionError: if `archivebox init` exits with a non-zero code.
    +        """
    +        # A new directory per call keeps the tests independent of each other.
    +        work_dir = Path(tempfile.mkdtemp())
    +
    +        try:
    +            result = run_archivebox(work_dir, ["init"])
    +            self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
    +
    +            # Read the schema straight from the SQLite file in the work dir.
    +            conn = sqlite3.connect(str(work_dir / "index.sqlite3"))
    +            try:
    +                rows = conn.execute(f"PRAGMA table_info({table})").fetchall()
    +            finally:
    +                # Close even when the query raises, so the handle never leaks.
    +                conn.close()
    +
    +            # Each table_info row is (cid, name, type, notnull, dflt_value, pk).
    +            return {row[1] for row in rows}
    +
    +        finally:
    +            shutil.rmtree(work_dir, ignore_errors=True)
    +
    +    def _assert_has_columns(self, table, required):
    +        """Assert every name in *required* is a column of *table* after init.
    +
    +        Args:
    +            table: SQLite table name, forwarded to `_columns_after_init`.
    +            required: Iterable of column names that must be present.
    +
    +        Returns:
    +            The full set of column names, so callers can make further checks.
    +        """
    +        columns = self._columns_after_init(table)
    +        # sorted() makes the first reported missing column deterministic.
    +        for col in sorted(required):
    +            self.assertIn(col, columns, f"Missing column: {col}")
    +        return columns
    +
    +    def test_snapshot_table_has_required_columns(self):
    +        """Snapshot table should have all required columns."""
    +        self._assert_has_columns(
    +            "core_snapshot",
    +            {"id", "url", "timestamp", "title", "status", "created_at", "modified_at"},
    +        )
    +
    +    def test_archiveresult_table_has_required_columns(self):
    +        """ArchiveResult table should have all required columns."""
    +        self._assert_has_columns(
    +            "core_archiveresult",
    +            {"id", "snapshot_id", "plugin", "status", "created_at", "modified_at"},
    +        )
    +
    +    def test_tag_table_has_required_columns(self):
    +        """Tag table should have all required columns."""
    +        self._assert_has_columns(
    +            "core_tag",
    +            {"id", "name", "slug"},
    +        )
    +
    +    def test_crawl_table_has_required_columns(self):
    +        """Crawl table should have all required columns."""
    +        columns = self._assert_has_columns(
    +            "crawls_crawl",
    +            {"id", "urls", "status", "created_at", "created_by_id"},
    +        )
    +
    +        # seed_id should NOT exist (removed in 0.9.x)
    +        self.assertNotIn("seed_id", columns, "seed_id column should not exist in 0.9.x")
    +
    +
    +class TestMultipleSnapshots(unittest.TestCase):
    +    """Test handling multiple snapshots."""
    +
    +    def test_add_urls_separately(self):
    +        """Should be able to add multiple URLs one at a time."""
    +        work_dir = Path(tempfile.mkdtemp())
    +
    +        try:
    +            result = run_archivebox(work_dir, ["init"])
    +            self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
    +
    +            # Add URLs one at a time
    +            result = run_archivebox(work_dir, ["add", "--index-only", "https://example.com"])
    +            self.assertEqual(result.returncode, 0, f"Add 1 failed: {result.stderr}")
    +
    +            result = run_archivebox(work_dir, ["add", "--index-only", "https://example.org"])
    +            self.assertEqual(result.returncode, 0, f"Add 2 failed: {result.stderr}")
    +
    +            # Read both counts first and close the connection in `finally`, so a
    +            # failing assertion below can no longer leak the open database handle.
    +            conn = sqlite3.connect(str(work_dir / "index.sqlite3"))
    +            try:
    +                snapshot_count = conn.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
    +                crawl_count = conn.execute("SELECT COUNT(*) FROM crawls_crawl").fetchone()[0]
    +            finally:
    +                conn.close()
    +
    +            # Verify snapshots were created
    +            self.assertEqual(snapshot_count, 2, f"Expected 2 snapshots, got {snapshot_count}")
    +
    +            # Verify crawls were created (one per add call)
    +            self.assertEqual(crawl_count, 2, f"Expected 2 Crawls, got {crawl_count}")
    +
    +        finally:
    +            shutil.rmtree(work_dir, ignore_errors=True)
    +
    +    def test_snapshots_linked_to_crawls(self):
    +        """Each snapshot should be linked to a crawl."""
    +        work_dir = Path(tempfile.mkdtemp())
    +
    +        try:
    +            result = run_archivebox(work_dir, ["init"])
    +            self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
    +
    +            result = run_archivebox(work_dir, ["add", "--index-only", "https://example.com"])
    +            self.assertEqual(result.returncode, 0, f"Add failed: {result.stderr}")
    +
    +            # Fetch the row, then close the connection before asserting so it cannot leak.
    +            conn = sqlite3.connect(str(work_dir / "index.sqlite3"))
    +            try:
    +                # Check that snapshot has a crawl_id
    +                row = conn.execute("SELECT crawl_id FROM core_snapshot WHERE url = 'https://example.com'").fetchone()
    +            finally:
    +                conn.close()
    +
    +            self.assertIsNotNone(row, "Snapshot not found")
    +            self.assertIsNotNone(row[0], "Snapshot should have a crawl_id")
    +
    +        finally:
    +            shutil.rmtree(work_dir, ignore_errors=True)
    +
    +
    +if __name__ == "__main__":
    +    unittest.main()  # run this module's tests when the file is executed as a script
    diff --git a/archivebox/tests/test_persona_admin.py b/archivebox/tests/test_persona_admin.py
    new file mode 100644
    index 0000000000..f209682ae3
    --- /dev/null
    +++ b/archivebox/tests/test_persona_admin.py
    @@ -0,0 +1,191 @@
    +import pytest
    +from typing import cast
    +
    +from django.contrib.auth import get_user_model
    +from django.contrib.auth.models import UserManager
    +from django.urls import reverse
    +
    +from archivebox.personas.importers import (
    +    PersonaImportResult,
    +    discover_persona_template_profiles,
    +    import_persona_from_source,
    +    resolve_browser_profile_source,
    +    resolve_custom_import_source,
    +)
    +
    +
    +pytestmark = pytest.mark.django_db
    +
    +User = get_user_model()
    +ADMIN_HOST = "admin.archivebox.localhost:8000"
    +
    +
    +@pytest.fixture
    +def admin_user(db):
    +    """Create the "personaadmin" superuser that the admin-view tests log in as."""
    +    # cast() only narrows the manager type for static checkers; it has no runtime effect.
    +    user_manager = cast(UserManager, User.objects)
    +    credentials = {"username": "personaadmin", "email": "personaadmin@test.com", "password": "testpassword"}
    +    return user_manager.create_superuser(**credentials)
    +
    +
    +def _make_profile_source(tmp_path):
    +    """Create a stub Chrome "Default" profile under tmp_path and resolve it as an import source."""
    +    chrome_root = tmp_path / "Chrome User Data"
    +    default_profile = chrome_root / "Default"
    +    default_profile.mkdir(parents=True)
    +    # The stub profile contains nothing but an empty-JSON Preferences file.
    +    default_profile.joinpath("Preferences").write_text("{}")
    +    chrome_binary = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
    +    return resolve_browser_profile_source(
    +        browser="chrome", user_data_dir=chrome_root, profile_dir="Default", browser_binary=chrome_binary,
    +    )
    +
    +
    +def test_resolve_custom_import_source_accepts_exact_profile_dir(tmp_path):
    +    """A path pointing straight at a profile dir should resolve to a browser-profile source."""
    +    brave_root = tmp_path / "Brave User Data"
    +    exact_profile = brave_root / "Profile 2"
    +    exact_profile.mkdir(parents=True)
    +    exact_profile.joinpath("Preferences").write_text("{}")
    +
    +    resolved = resolve_custom_import_source(str(exact_profile))
    +    assert resolved.kind == "browser-profile"
    +    assert resolved.user_data_dir == brave_root.resolve()
    +    assert resolved.profile_dir == "Profile 2"
    +
    +
    +def test_resolve_custom_import_source_accepts_cdp_url():
    +    """A ws:// DevTools endpoint should resolve to a "cdp" source that keeps the URL unchanged."""
    +    endpoint = "ws://127.0.0.1:9222/devtools/browser/test-session"
    +    resolved = resolve_custom_import_source(endpoint)
    +    assert (resolved.kind, resolved.cdp_url) == ("cdp", endpoint)
    +
    +
    +def test_discover_persona_template_profiles_finds_chrome_profile_dirs(tmp_path):
    +    """A persona's chrome_profile/Default dir under personas_dir should be discovered as a template."""
    +    personas_root = tmp_path / "personas"
    +    template_root = personas_root / "ExistingPersona" / "chrome_profile"
    +    stub_profile = template_root / "Default"
    +    stub_profile.mkdir(parents=True)
    +    stub_profile.joinpath("Preferences").write_text("{}")
    +
    +    found = discover_persona_template_profiles(personas_dir=personas_root)
    +    assert len(found) == 1
    +    assert found[0].browser == "persona"
    +    assert found[0].source_name == "ExistingPersona"
    +    assert found[0].profile_dir == "Default"
    +    assert found[0].user_data_dir == template_root.resolve()
    +
    +
    +def test_discover_persona_template_profiles_finds_home_abx_personas(monkeypatch, tmp_path):
    +    """With no data-dir personas on disk, templates under ~/.config/abx/personas are still found."""
    +    from archivebox.config.constants import CONSTANTS
    +    # PERSONAS_DIR points at a path that is never created; Path.home() is redirected to tmp_path.
    +    monkeypatch.setattr(CONSTANTS, "PERSONAS_DIR", tmp_path / "missing-data-personas")
    +    monkeypatch.setattr("archivebox.personas.importers.Path.home", lambda: tmp_path)
    +
    +    home_template = tmp_path / ".config" / "abx" / "personas" / "HomePersona" / "chrome_profile"
    +    stub_profile = home_template / "Default"
    +    stub_profile.mkdir(parents=True)
    +    stub_profile.joinpath("Preferences").write_text("{}")
    +
    +    found = discover_persona_template_profiles()
    +    assert len(found) == 1
    +    assert found[0].browser == "persona"
    +    assert found[0].source_name == "HomePersona"
    +    assert found[0].profile_dir == "Default"
    +    assert found[0].user_data_dir == home_template.resolve()
    +
    +
    +def test_persona_admin_add_view_renders_import_ui(client, admin_user, monkeypatch, tmp_path):
    +    """The persona add page should render the import UI and list the discovered profile."""
    +    fake_source = _make_profile_source(tmp_path)
    +    monkeypatch.setattr("archivebox.personas.forms.discover_local_browser_profiles", lambda: [fake_source])
    +    monkeypatch.setattr("archivebox.personas.admin.discover_local_browser_profiles", lambda: [fake_source])
    +    client.login(username="personaadmin", password="testpassword")
    +    add_url = reverse("admin:personas_persona_add")
    +    page = client.get(add_url, HTTP_HOST=ADMIN_HOST)
    +
    +    assert page.status_code == 200
    +    for marker in (b"Bootstrap a persona from a real browser session", b"Google Chrome / Default", b"auth.json"):
    +        assert marker in page.content
    +
    +
    +def test_import_persona_from_source_copies_user_agent_to_persona_config(admin_user, monkeypatch, tmp_path):
    +    """A user agent returned by the browser-state export should end up in persona.config["USER_AGENT"]."""
    +    from archivebox.personas.models import Persona
    +
    +    imported_ua = "Mozilla/5.0 Test Imported UA"
    +    profile_source = _make_profile_source(tmp_path)
    +    agent_persona = Persona.objects.create(name="AgentPersona", created_by=admin_user)
    +
    +    def fake_export_browser_state(**kwargs):
    +        # Stand-in for export_browser_state: ignores its kwargs and returns a fixed 3-tuple.
    +        return True, {"user_agent": imported_ua}, "ok"
    +
    +    monkeypatch.setattr("archivebox.personas.importers.export_browser_state", fake_export_browser_state)
    +
    +    # Every step flag is False for this call.
    +    outcome = import_persona_from_source(
    +        agent_persona, profile_source, copy_profile=False, import_cookies=False, capture_storage=False,
    +    )
    +
    +    agent_persona.refresh_from_db()
    +    assert outcome.user_agent_imported is True
    +    assert agent_persona.config["USER_AGENT"] == imported_ua
    +
    +
    +def test_persona_admin_add_post_runs_shared_importer(client, admin_user, monkeypatch, tmp_path):
    +    """POSTing the admin add form should call the importer with the discovered source and all three options on."""
    +    from archivebox.personas.models import Persona
    +
    +    source = _make_profile_source(tmp_path)
    +    monkeypatch.setattr("archivebox.personas.forms.discover_local_browser_profiles", lambda: [source])
    +    monkeypatch.setattr("archivebox.personas.admin.discover_local_browser_profiles", lambda: [source])
    +    calls = {}
    +
    +    def fake_import(persona, selected_source, **kwargs):
    +        calls["persona_name"] = persona.name
    +        calls["source"] = selected_source
    +        calls["kwargs"] = kwargs
    +        (persona.path / "cookies.txt").parent.mkdir(parents=True, exist_ok=True)
    +        (persona.path / "cookies.txt").write_text("# Netscape HTTP Cookie File\n")
    +        (persona.path / "auth.json").write_text('{"TYPE":"auth","cookies":[],"localStorage":{},"sessionStorage":{}}\n')
    +        return PersonaImportResult(
    +            source=selected_source,
    +            profile_copied=True,
    +            cookies_imported=True,
    +            storage_captured=True,
    +        )
    +
    +    monkeypatch.setattr("archivebox.personas.forms.import_persona_from_source", fake_import)
    +
    +    client.login(username="personaadmin", password="testpassword")
    +    response = client.post(
    +        reverse("admin:personas_persona_add"),
    +        {
    +            "name": "ImportedPersona",
    +            "created_by": str(admin_user.pk),
    +            "config": "{}",
    +            "import_mode": "discovered",
    +            "import_discovered_profile": source.choice_value,
    +            "import_copy_profile": "on",
    +            "import_extract_cookies": "on",
    +            "import_capture_storage": "on",
    +            "_save": "Save",
    +        },
    +        HTTP_HOST=ADMIN_HOST,
    +    )
    +
    +    assert response.status_code == 302
    +    persona = Persona.objects.get(name="ImportedPersona")
    +    assert calls["persona_name"] == "ImportedPersona"
    +    assert calls["source"].profile_dir == "Default"
    +    assert calls["kwargs"] == {
    +        "copy_profile": True,
    +        "import_cookies": True,
    +        "capture_storage": True,
    +    }
    +    assert persona.COOKIES_FILE.endswith("cookies.txt")
    +    assert persona.AUTH_STORAGE_FILE.endswith("auth.json")
    diff --git a/archivebox/tests/test_persona_runtime.py b/archivebox/tests/test_persona_runtime.py
    new file mode 100644
    index 0000000000..ff580f609e
    --- /dev/null
    +++ b/archivebox/tests/test_persona_runtime.py
    @@ -0,0 +1,180 @@
    +#!/usr/bin/env python3
    +"""Tests for per-crawl Persona runtime profile management."""
    +
    +import json
    +import textwrap
    +
    +from .conftest import run_python_cwd
    +
    +
    +def test_persona_prepare_runtime_for_crawl_clones_and_cleans_profile(initialized_archive):
    +    """prepare_runtime_for_crawl should copy the persona's Chrome profile for a crawl, minus lock/cache/log files."""
    +    script = textwrap.dedent(
    +        """
    +        import json
    +        import os
    +        from pathlib import Path
    +
    +        os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings')
    +        import django
    +        django.setup()
    +
    +        from archivebox.crawls.models import Crawl
    +        from archivebox.personas.models import Persona
    +
    +        persona, _ = Persona.objects.get_or_create(name='Default')
    +        persona.ensure_dirs()
    +
    +        template_dir = Path(persona.CHROME_USER_DATA_DIR)
    +        (template_dir / 'SingletonLock').write_text('locked')
    +        (template_dir / 'chrome.log').write_text('noise')
    +        (template_dir / 'Default' / 'GPUCache').mkdir(parents=True, exist_ok=True)
    +        (template_dir / 'Default' / 'GPUCache' / 'blob').write_text('cached')
    +        (template_dir / 'Default' / 'Preferences').write_text('{"ok": true}')
    +
    +        crawl = Crawl.objects.create(urls='https://example.com', persona_id=persona.id)
    +        overrides = persona.prepare_runtime_for_crawl(
    +            crawl,
    +            chrome_binary='/Applications/Chromium.app/Contents/MacOS/Chromium',
    +        )
    +
    +        runtime_root = persona.runtime_root_for_crawl(crawl)
    +        runtime_profile = Path(overrides['CHROME_USER_DATA_DIR'])
    +        runtime_downloads = Path(overrides['CHROME_DOWNLOADS_DIR'])
    +
    +        print(json.dumps({
    +            'runtime_root_exists': runtime_root.exists(),
    +            'runtime_profile_exists': runtime_profile.exists(),
    +            'runtime_downloads_exists': runtime_downloads.exists(),
    +            'preferences_copied': (runtime_profile / 'Default' / 'Preferences').exists(),
    +            'singleton_removed': not (runtime_profile / 'SingletonLock').exists(),
    +            'cache_removed': not (runtime_profile / 'Default' / 'GPUCache').exists(),
    +            'log_removed': not (runtime_profile / 'chrome.log').exists(),
    +            'persona_name_recorded': (runtime_root / 'persona_name.txt').read_text().strip(),
    +            'template_dir_recorded': (runtime_root / 'template_dir.txt').read_text().strip(),
    +            'chrome_binary_recorded': (runtime_root / 'chrome_binary.txt').read_text().strip(),
    +        }))
    +        """,
    +    )
    +    stdout, stderr, code = run_python_cwd(script, cwd=initialized_archive, timeout=60)
    +    assert code == 0, stderr
    +    # The script's JSON report is the last line it printed to stdout.
    +    payload = json.loads(stdout.strip().splitlines()[-1])
    +    assert payload["runtime_root_exists"] is True
    +    assert payload["runtime_profile_exists"] is True
    +    assert payload["runtime_downloads_exists"] is True
    +    assert payload["preferences_copied"] is True
    +    assert payload["singleton_removed"] is True
    +    assert payload["cache_removed"] is True
    +    assert payload["log_removed"] is True
    +    assert payload["persona_name_recorded"] == "Default"
    +    assert payload["template_dir_recorded"].endswith("/personas/Default/chrome_user_data")
    +    assert payload["chrome_binary_recorded"] == "/Applications/Chromium.app/Contents/MacOS/Chromium"
    +
    +
    +def test_persona_cleanup_runtime_for_crawl_removes_only_runtime_copy(initialized_archive):
    +    """cleanup_runtime_for_crawl should delete the per-crawl runtime dir but keep the persona's template profile."""
    +    script = textwrap.dedent(
    +        """
    +        import json
    +        import os
    +        from pathlib import Path
    +
    +        os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings')
    +        import django
    +        django.setup()
    +
    +        from archivebox.crawls.models import Crawl
    +        from archivebox.personas.models import Persona
    +
    +        persona, _ = Persona.objects.get_or_create(name='Default')
    +        persona.ensure_dirs()
    +        template_dir = Path(persona.CHROME_USER_DATA_DIR)
    +        (template_dir / 'Default').mkdir(parents=True, exist_ok=True)
    +        (template_dir / 'Default' / 'Preferences').write_text('{"kept": true}')
    +
    +        crawl = Crawl.objects.create(urls='https://example.com', persona_id=persona.id)
    +        persona.prepare_runtime_for_crawl(crawl)
    +        runtime_root = persona.runtime_root_for_crawl(crawl)
    +
    +        persona.cleanup_runtime_for_crawl(crawl)
    +
    +        print(json.dumps({
    +            'runtime_removed': not runtime_root.exists(),
    +            'template_still_exists': (template_dir / 'Default' / 'Preferences').exists(),
    +        }))
    +        """,
    +    )
    +    stdout, stderr, code = run_python_cwd(script, cwd=initialized_archive, timeout=60)
    +    assert code == 0, stderr
    +    # The script's JSON report is the last line it printed to stdout.
    +    payload = json.loads(stdout.strip().splitlines()[-1])
    +    assert payload["runtime_removed"] is True
    +    assert payload["template_still_exists"] is True
    +
    +
    +def test_crawl_resolve_persona_raises_for_missing_persona_id(initialized_archive):
    +    """Crawl.resolve_persona() should raise Persona.DoesNotExist for a crawl created with a random persona_id."""
    +    script = textwrap.dedent(
    +        """
    +        import json
    +        import os
    +        from uuid import uuid4
    +
    +        os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings')
    +        import django
    +        django.setup()
    +
    +        from archivebox.crawls.models import Crawl
    +        from archivebox.personas.models import Persona
    +
    +        crawl = Crawl.objects.create(urls='https://example.com', persona_id=uuid4())
    +
    +        try:
    +            crawl.resolve_persona()
    +        except Persona.DoesNotExist as err:
    +            print(json.dumps({'raised': True, 'message': str(err)}))
    +        else:
    +            raise SystemExit('resolve_persona unexpectedly succeeded')
    +        """,
    +    )
    +    stdout, stderr, code = run_python_cwd(script, cwd=initialized_archive, timeout=60)
    +    assert code == 0, stderr
    +    # The script's JSON report is the last line it printed to stdout.
    +    payload = json.loads(stdout.strip().splitlines()[-1])
    +    assert payload["raised"] is True
    +    assert "references missing Persona" in payload["message"]
    +
    +
    +def test_get_config_raises_for_missing_persona_id(initialized_archive):
    +    """get_config(crawl=...) should raise Persona.DoesNotExist for a crawl created with a random persona_id."""
    +    script = textwrap.dedent(
    +        """
    +        import json
    +        import os
    +        from uuid import uuid4
    +
    +        os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings')
    +        import django
    +        django.setup()
    +
    +        from archivebox.config.configset import get_config
    +        from archivebox.crawls.models import Crawl
    +        from archivebox.personas.models import Persona
    +
    +        crawl = Crawl.objects.create(urls='https://example.com', persona_id=uuid4())
    +
    +        try:
    +            get_config(crawl=crawl)
    +        except Persona.DoesNotExist as err:
    +            print(json.dumps({'raised': True, 'message': str(err)}))
    +        else:
    +            raise SystemExit('get_config unexpectedly succeeded')
    +        """,
    +    )
    +    stdout, stderr, code = run_python_cwd(script, cwd=initialized_archive, timeout=60)
    +    assert code == 0, stderr
    +    # The script's JSON report is the last line it printed to stdout.
    +    payload = json.loads(stdout.strip().splitlines()[-1])
    +    assert payload["raised"] is True
    +    assert "references missing Persona" in payload["message"]
    diff --git a/archivebox/tests/test_process_runtime_paths.py b/archivebox/tests/test_process_runtime_paths.py
    new file mode 100644
    index 0000000000..e9bb95f0d0
    --- /dev/null
    +++ b/archivebox/tests/test_process_runtime_paths.py
    @@ -0,0 +1,37 @@
    +import os
    +import unittest
    +from pathlib import Path
    +
    +
    +os.environ.setdefault("DJANGO_SETTINGS_MODULE", "archivebox.core.settings")  # was "archivebox.settings"; must be set before the model import below
    +
    +
    +from archivebox.machine.models import Process
    +
    +
    +class TestProcessRuntimePaths(unittest.TestCase):
    +    """Where a Process keeps its stdout/stderr/pid files, by process type."""
    +
    +    def _check_runtime_layout(self, proc_obj, runtime_dir):
    +        """Assert runtime_dir and the three runtime files of *proc_obj* all sit in *runtime_dir*."""
    +        self.assertEqual(proc_obj.runtime_dir, runtime_dir)
    +        self.assertEqual(proc_obj.stdout_file, runtime_dir / "stdout.log")
    +        self.assertEqual(proc_obj.stderr_file, runtime_dir / "stderr.log")
    +        self.assertEqual(proc_obj.pid_file, runtime_dir / "process.pid")
    +
    +    def test_hook_processes_use_isolated_runtime_dir(self):
    +        """HOOK processes should get <pwd>/.hooks/<hook script name>/ as their runtime dir."""
    +        # The expected directory is named after the hook script's basename as it appears in cmd.
    +        hook = Process(
    +            process_type=Process.TypeChoices.HOOK, pwd="/tmp/archive/example/chrome",
    +            cmd=["node", "/plugins/chrome/on_Snapshot__11_chrome_wait.js", "--url=https://example.com"],
    +        )
    +        self._check_runtime_layout(hook, Path("/tmp/archive/example/chrome/.hooks/on_Snapshot__11_chrome_wait.js"))
    +
    +    def test_non_hook_processes_keep_runtime_files_in_pwd(self):
    +        """WORKER processes should use their pwd itself as the runtime dir."""
    +        worker = Process(
    +            process_type=Process.TypeChoices.WORKER, pwd="/tmp/archive/example",
    +            cmd=["archivebox", "run", "--snapshot-id", "123"],
    +        )
    +        self._check_runtime_layout(worker, Path("/tmp/archive/example"))
    diff --git a/archivebox/tests/test_process_service.py b/archivebox/tests/test_process_service.py
    new file mode 100644
    index 0000000000..577f8eb190
    --- /dev/null
    +++ b/archivebox/tests/test_process_service.py
    @@ -0,0 +1,4 @@
    +import pytest
    +
    +
    +pytestmark = pytest.mark.django_db
    diff --git a/archivebox/tests/test_recursive_crawl.py b/archivebox/tests/test_recursive_crawl.py
    new file mode 100644
    index 0000000000..9bef4c12b5
    --- /dev/null
    +++ b/archivebox/tests/test_recursive_crawl.py
    @@ -0,0 +1,431 @@
    +#!/usr/bin/env python3
    +"""Integration tests for recursive crawling functionality."""
    +
    +import json
    +import os
    +import subprocess
    +import sqlite3
    +import time
    +from pathlib import Path
    +
    +import pytest
    +
    +
    +def wait_for_db_condition(timeout, condition, interval=0.5):  # True once condition(cursor) is truthy, else False
    +    deadline = time.monotonic() + timeout  # monotonic clock: wall-clock adjustments cannot skew the wait
    +    while time.monotonic() < deadline:  # re-check roughly every `interval` seconds
    +        if os.path.exists("index.sqlite3"):  # relative path: resolved against the current working directory
    +            conn = sqlite3.connect("index.sqlite3")  # fresh connection for every poll
    +            try:
    +                if condition(conn.cursor()):
    +                    return True
    +            finally:
    +                conn.close()  # runs on the early return and on exceptions alike
    +        time.sleep(interval)
    +    return False  # timed out without the condition ever holding
    +
    +
    +def stop_process(proc):  # returns whatever proc.communicate() returns
    +    if proc.poll() is None:  # poll() is None: no exit code yet, so the process is still running
    +        proc.terminate()  # ask it to stop first
    +        try:
    +            return proc.communicate(timeout=5)
    +        except subprocess.TimeoutExpired:
    +            proc.kill()  # still alive after 5s: force it, then fall through to collect the output
    +    return proc.communicate()
    +
    +
    +def run_add_until(args, env, condition, timeout=120):
    +    """Run `args` as a subprocess until condition(cursor) holds on ./index.sqlite3, then stop it.
    +
    +    Returns stop_process()'s output; raises AssertionError if `timeout` seconds pass first."""
    +    proc = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, env=env)
    +    try:
    +        met = wait_for_db_condition(timeout=timeout, condition=condition)
    +    finally:
    +        output = stop_process(proc)  # always stop the child, even on timeout or an exception
    +    assert met, f"Timed out waiting for condition while running: {' '.join(args)}"
    +    return output
    +
    +
    +def test_background_hooks_dont_block_parser_extractors(tmp_path, process, recursive_test_site):
    +    """Test that background hooks (.bg.) don't block other extractors from running."""
    +    os.chdir(tmp_path)
    +    # Verify init succeeded
    +    assert process.returncode == 0, f"archivebox init failed: {process.stderr}"
    +
    +    # Enable only parser extractors and background hooks for this test
    +    env = os.environ.copy()
    +    env.update(
    +        {
    +            # Disable most extractors
    +            "SAVE_WGET": "false",
    +            "SAVE_SINGLEFILE": "false",
    +            "SAVE_READABILITY": "false",
    +            "SAVE_MERCURY": "false",
    +            "SAVE_HTMLTOTEXT": "false",
    +            "SAVE_PDF": "false",
    +            "SAVE_SCREENSHOT": "false",
    +            "SAVE_DOM": "false",
    +            "SAVE_HEADERS": "false",
    +            "SAVE_GIT": "false",
    +            "SAVE_YTDLP": "false",
    +            "SAVE_ARCHIVEDOTORG": "false",
    +            "SAVE_TITLE": "false",
    +            "SAVE_FAVICON": "true",
    +        },
    +    )
    +
    +    proc = subprocess.Popen(
    +        ["archivebox", "add", "--depth=1", "--plugins=favicon,parse_html_urls", recursive_test_site["root_url"]],
    +        stdout=subprocess.PIPE,
    +        stderr=subprocess.PIPE,
    +        text=True,
    +        env=env,
    +    )
    +    try:
    +        parsers_progressed = wait_for_db_condition(
    +            timeout=120,
    +            condition=lambda c: (
    +                c.execute(
    +                    "SELECT COUNT(*) FROM core_archiveresult WHERE plugin LIKE 'parse_%_urls' AND status IN ('started', 'succeeded', 'failed')",
    +                ).fetchone()[0]
    +                > 0
    +            ),
    +        )
    +    finally:
    +        stdout, stderr = stop_process(proc)  # always stop the child so a timeout cannot leave `archivebox add` running
    +    assert parsers_progressed, "Parser extractors never progressed beyond queued status"
    +
    +    if stderr:
    +        print(f"\n=== STDERR ===\n{stderr}\n=== END STDERR ===\n")
    +    if stdout:
    +        print(f"\n=== STDOUT (last 2000 chars) ===\n{stdout[-2000:]}\n=== END STDOUT ===\n")
    +
    +    conn = sqlite3.connect("index.sqlite3")
    +    try:
    +        snapshots = conn.execute("SELECT url, depth, status FROM core_snapshot").fetchall()
    +        bg_hooks = conn.execute(
    +            "SELECT plugin, status FROM core_archiveresult WHERE plugin IN ('favicon', 'consolelog', 'ssl', 'responses', 'redirects', 'staticfile') ORDER BY plugin",
    +        ).fetchall()
    +        parser_extractors = conn.execute(
    +            "SELECT plugin, status FROM core_archiveresult WHERE plugin LIKE 'parse_%_urls' ORDER BY plugin",
    +        ).fetchall()
    +        all_extractors = conn.execute(
    +            "SELECT plugin, status FROM core_archiveresult ORDER BY plugin",
    +        ).fetchall()
    +    finally:
    +        conn.close()  # closed even if one of the queries raises
    +
    +    assert len(snapshots) > 0, (
    +        f"Should have created snapshot after Crawl hooks finished. "
    +        f"If this fails, Crawl hooks may be taking too long. "
    +        f"Snapshots: {snapshots}"
    +    )
    +
    +    assert len(all_extractors) > 0, (
    +        f"Should have extractors created for snapshot. If this fails, Snapshot.run() may not have started. Got: {all_extractors}"
    +    )
    +
    +    parser_statuses = [status for _, status in parser_extractors]
    +    assert "started" in parser_statuses or "succeeded" in parser_statuses or "failed" in parser_statuses, (
    +        f"Parser extractors should have run, got statuses: {parser_statuses}. Background hooks: {bg_hooks}"
    +    )
    +
    +
    +def test_parser_extractors_emit_snapshot_jsonl(tmp_path, process, recursive_test_site):
    +    """Test that parse_html_urls writes Snapshot-typed JSONL records to a urls.jsonl file."""
    +    os.chdir(tmp_path)
    +
    +    env = os.environ.copy()
    +    env.update(
    +        {
    +            "SAVE_WGET": "false",  # NOTE(review): --plugins below still lists wget; confirm which setting wins
    +            "SAVE_SINGLEFILE": "false",
    +            "SAVE_READABILITY": "false",
    +            "SAVE_MERCURY": "false",
    +            "SAVE_HTMLTOTEXT": "false",
    +            "SAVE_PDF": "false",
    +            "SAVE_SCREENSHOT": "false",
    +            "SAVE_DOM": "false",
    +            "SAVE_HEADERS": "false",
    +            "SAVE_GIT": "false",
    +            "SAVE_YTDLP": "false",
    +            "SAVE_ARCHIVEDOTORG": "false",
    +            "SAVE_TITLE": "false",
    +            "SAVE_FAVICON": "false",
    +            "USE_CHROME": "false",
    +        },
    +    )
    +    # Run a depth-0 add to completion (60 second cap) and require a zero exit code.
    +    result = subprocess.run(
    +        ["archivebox", "add", "--depth=0", "--plugins=wget,parse_html_urls", recursive_test_site["root_url"]],
    +        capture_output=True,
    +        text=True,
    +        env=env,
    +        timeout=60,
    +    )
    +    assert result.returncode == 0, result.stderr
    +    # Fetch the parse_html_urls ArchiveResult row with the lowest id, if one exists.
    +    conn = sqlite3.connect("index.sqlite3")
    +    c = conn.cursor()
    +
    +    parse_html = c.execute(
    +        "SELECT id, status, output_str FROM core_archiveresult WHERE plugin LIKE '%parse_html_urls' ORDER BY id LIMIT 1",
    +    ).fetchone()
    +
    +    conn.close()
    +    # The DB row is optional: these checks are skipped entirely when no row was recorded.
    +    if parse_html:
    +        status = parse_html[1]
    +        output = parse_html[2] or ""
    +
    +        assert status in ["started", "succeeded", "failed"], f"60_parse_html_urls should have run, got status: {status}"
    +
    +        if status == "succeeded" and output:
    +            assert "parsed" in output.lower(), "Parser summary should report parsed URLs"
    +    # The on-disk output is mandatory: at least one urls.jsonl below a parse_html_urls directory.
    +    urls_jsonl_files = list(Path("users/system/snapshots").rglob("parse_html_urls/**/urls.jsonl"))
    +    assert urls_jsonl_files, "parse_html_urls should write urls.jsonl output"
    +    # Decode the first matching file as JSONL, ignoring blank lines.
    +    records = []
    +    for line in urls_jsonl_files[0].read_text().splitlines():
    +        if line.strip():
    +            records.append(json.loads(line))
    +    # The file must be non-empty and every record must be typed "Snapshot".
    +    assert records, "urls.jsonl should contain parsed Snapshot records"
    +    assert all(record.get("type") == "Snapshot" for record in records), f"Expected Snapshot JSONL records, got: {records}"
    +
    +
    +def test_recursive_crawl_creates_child_snapshots(tmp_path, process, recursive_test_site):
    +    """Test that recursive crawling creates child snapshots with proper depth and parent_snapshot_id."""
    +    os.chdir(tmp_path)
    +
    +    env = os.environ.copy()
    +    env.update(
    +        {
    +            "URL_ALLOWLIST": r"127\.0\.0\.1[:/].*",
    +            "SAVE_READABILITY": "false",
    +            "SAVE_SINGLEFILE": "false",
    +            "SAVE_MERCURY": "false",
    +            "SAVE_SCREENSHOT": "false",
    +            "SAVE_PDF": "false",
    +            "SAVE_HEADERS": "false",
    +            "SAVE_ARCHIVEDOTORG": "false",
    +            "SAVE_GIT": "false",
    +            "SAVE_YTDLP": "false",
    +            "SAVE_TITLE": "false",
    +        },
    +    )
    +    # Condition: a depth-0 snapshot exists plus at least one depth-1 snapshot per expected child URL.
    +    stdout, stderr = run_add_until(
    +        ["archivebox", "add", "--depth=1", "--plugins=wget,parse_html_urls", recursive_test_site["root_url"]],
    +        env=env,
    +        timeout=120,
    +        condition=lambda c: (
    +            c.execute("SELECT COUNT(*) FROM core_snapshot WHERE depth = 0").fetchone()[0] >= 1
    +            and c.execute("SELECT COUNT(*) FROM core_snapshot WHERE depth = 1").fetchone()[0] >= len(recursive_test_site["child_urls"])
    +        ),
    +    )
    +    # Print the captured process output for debugging.
    +    if stderr:
    +        print(f"\n=== STDERR ===\n{stderr}\n=== END STDERR ===\n")
    +    if stdout:
    +        print(f"\n=== STDOUT (last 2000 chars) ===\n{stdout[-2000:]}\n=== END STDOUT ===\n")
    +
    +    conn = sqlite3.connect("index.sqlite3")
    +    c = conn.cursor()
    +    # Gather every row first so the connection is closed before any assertion runs.
    +    all_snapshots = c.execute("SELECT url, depth FROM core_snapshot").fetchall()
    +    root_snapshot = c.execute(
    +        "SELECT id, url, depth, parent_snapshot_id FROM core_snapshot WHERE depth = 0 ORDER BY created_at LIMIT 1",
    +    ).fetchone()
    +    child_snapshots = c.execute(
    +        "SELECT id, url, depth, parent_snapshot_id FROM core_snapshot WHERE depth = 1",
    +    ).fetchall()
    +    crawl = c.execute(
    +        "SELECT id, max_depth FROM crawls_crawl ORDER BY created_at DESC LIMIT 1",
    +    ).fetchone()
    +    parser_status = c.execute(
    +        "SELECT plugin, status FROM core_archiveresult WHERE snapshot_id = ? AND plugin LIKE 'parse_%_urls'",
    +        (root_snapshot[0] if root_snapshot else "",),  # falls back to "" so the query still runs without a root row
    +    ).fetchall()
    +    started_extractors = c.execute(
    +        "SELECT plugin, status FROM core_archiveresult WHERE snapshot_id = ? AND status = 'started'",
    +        (root_snapshot[0] if root_snapshot else "",),
    +    ).fetchall()
    +
    +    conn.close()
    +
    +    assert root_snapshot is not None, f"Root snapshot should exist at depth=0. All snapshots: {all_snapshots}"
    +    root_id = root_snapshot[0]
    +
    +    assert crawl is not None, "Crawl should be created"
    +    assert crawl[1] == 1, f"Crawl max_depth should be 1, got {crawl[1]}"
    +
    +    assert len(child_snapshots) > 0, (
    +        f"Child snapshots should be created from the root page's links. Parser status: {parser_status}. Started extractors blocking: {started_extractors}"
    +    )
    +    # Every depth-1 row must point back at the root snapshot.
    +    for _child_id, child_url, child_depth, parent_id in child_snapshots:
    +        assert child_depth == 1, f"Child snapshot should have depth=1, got {child_depth}"
    +        assert parent_id == root_id, f"Child snapshot {child_url} should have parent_snapshot_id={root_id}, got {parent_id}"
    +
    +
    +def test_recursive_crawl_respects_depth_limit(tmp_path, process, disable_extractors_dict, recursive_test_site):
    +    """Test that recursive crawling stops at max_depth."""
    +    os.chdir(tmp_path)
    +
    +    env = disable_extractors_dict.copy()
    +    env["URL_ALLOWLIST"] = r"127\.0\.0\.1[:/].*"
    +    # Condition: root exists, every expected child exists, and that many depth-1 snapshots have a parser result that started/succeeded/failed.
    +    stdout, stderr = run_add_until(  # the captured output is not inspected in this test
    +        ["archivebox", "add", "--depth=1", "--plugins=wget,parse_html_urls", recursive_test_site["root_url"]],
    +        env=env,
    +        timeout=120,
    +        condition=lambda c: (
    +            c.execute("SELECT COUNT(*) FROM core_snapshot WHERE depth = 0").fetchone()[0] >= 1
    +            and c.execute("SELECT COUNT(*) FROM core_snapshot WHERE depth = 1").fetchone()[0] >= len(recursive_test_site["child_urls"])
    +            and c.execute(
    +                "SELECT COUNT(DISTINCT ar.snapshot_id) "
    +                "FROM core_archiveresult ar "
    +                "JOIN core_snapshot s ON s.id = ar.snapshot_id "
    +                "WHERE s.depth = 1 "
    +                "AND ar.plugin LIKE 'parse_%_urls' "
    +                "AND ar.status IN ('started', 'succeeded', 'failed')",
    +            ).fetchone()[0]
    +            >= len(recursive_test_site["child_urls"])
    +        ),
    +    )
    +    # Waiting on the depth-1 parsers presumably gives any over-deep snapshots a chance to appear before this check.
    +    conn = sqlite3.connect("index.sqlite3")
    +    c = conn.cursor()
    +
    +    max_depth_found = c.execute(
    +        "SELECT MAX(depth) FROM core_snapshot",
    +    ).fetchone()[0]
    +    depth_counts = c.execute(  # per-depth row counts, used only in the failure message
    +        "SELECT depth, COUNT(*) FROM core_snapshot GROUP BY depth ORDER BY depth",
    +    ).fetchall()
    +
    +    conn.close()
    +
    +    assert max_depth_found is not None, "Should have at least one snapshot"
    +    assert max_depth_found <= 1, f"Max depth should not exceed 1, got {max_depth_found}. Depth distribution: {depth_counts}"
    +
    +
    +def test_crawl_snapshot_has_parent_snapshot_field(tmp_path, process, disable_extractors_dict):
    +    """Test that the core_snapshot table has a parent_snapshot_id column."""
    +    os.chdir(tmp_path)
    +    # No archivebox command is run here; disable_extractors_dict is requested but never used in the body.
    +    conn = sqlite3.connect("index.sqlite3")
    +    c = conn.cursor()
    +
    +    # Check schema for parent_snapshot_id column
    +    schema = c.execute("PRAGMA table_info(core_snapshot)").fetchall()
    +    conn.close()
    +
    +    column_names = [col[1] for col in schema]  # index 1 of each table_info row is the column name
    +
    +    assert "parent_snapshot_id" in column_names, f"Snapshot table should have parent_snapshot_id column. Columns: {column_names}"
    +
    +
    +def test_snapshot_depth_field_exists(tmp_path, process, disable_extractors_dict):
    +    """Test that the core_snapshot table has a depth column."""
    +    os.chdir(tmp_path)
    +    # No archivebox command is run here; disable_extractors_dict is requested but never used in the body.
    +    conn = sqlite3.connect("index.sqlite3")
    +    c = conn.cursor()
    +
    +    # Check schema for depth column
    +    schema = c.execute("PRAGMA table_info(core_snapshot)").fetchall()
    +    conn.close()
    +
    +    column_names = [col[1] for col in schema]  # index 1 of each table_info row is the column name
    +
    +    assert "depth" in column_names, f"Snapshot table should have depth column. Columns: {column_names}"
    +
    +
    +def test_root_snapshot_has_depth_zero(tmp_path, process, disable_extractors_dict, recursive_test_site):
    +    """Test that root snapshots are created with depth=0."""
    +    os.chdir(tmp_path)
    +
    +    env = disable_extractors_dict.copy()
    +    env["URL_ALLOWLIST"] = r"127\.0\.0\.1[:/].*"
    +    # Condition: a snapshot row for the root URL exists.
    +    stdout, stderr = run_add_until(  # the captured output is not inspected in this test
    +        ["archivebox", "add", "--depth=1", "--plugins=wget,parse_html_urls", recursive_test_site["root_url"]],
    +        env=env,
    +        timeout=120,
    +        condition=lambda c: (
    +            c.execute(
    +                "SELECT COUNT(*) FROM core_snapshot WHERE url = ?",
    +                (recursive_test_site["root_url"],),
    +            ).fetchone()[0]
    +            >= 1
    +        ),
    +    )
    +
    +    conn = sqlite3.connect("index.sqlite3")
    +    c = conn.cursor()
    +    # Use the oldest row for the root URL.
    +    snapshot = c.execute(
    +        "SELECT id, depth FROM core_snapshot WHERE url = ? ORDER BY created_at LIMIT 1",
    +        (recursive_test_site["root_url"],),
    +    ).fetchone()
    +
    +    conn.close()
    +
    +    assert snapshot is not None, "Root snapshot should be created"
    +    assert snapshot[1] == 0, f"Root snapshot should have depth=0, got {snapshot[1]}"
    +
    +
    +def test_archiveresult_worker_queue_filters_by_foreground_extractors(tmp_path, process, recursive_test_site):
    +    """Test that background hooks don't block foreground extractors from running."""
    +    os.chdir(tmp_path)
    +
    +    env = os.environ.copy()
    +    env.update(
    +        {
    +            "SAVE_WGET": "true",
    +            "SAVE_SINGLEFILE": "false",
    +            "SAVE_PDF": "false",
    +            "SAVE_SCREENSHOT": "false",
    +            "SAVE_FAVICON": "true",
    +        },
    +    )
    +    # Condition: at least one parse_*_urls result has status started, succeeded or failed.
    +    stdout, stderr = run_add_until(  # the captured output is not inspected in this test
    +        ["archivebox", "add", "--plugins=favicon,wget,parse_html_urls", recursive_test_site["root_url"]],
    +        env=env,
    +        timeout=120,
    +        condition=lambda c: (
    +            c.execute(
    +                "SELECT COUNT(*) FROM core_archiveresult WHERE plugin LIKE 'parse_%_urls' AND status IN ('started', 'succeeded', 'failed')",
    +            ).fetchone()[0]
    +            > 0
    +        ),
    +    )
    +
    +    conn = sqlite3.connect("index.sqlite3")
    +    c = conn.cursor()
    +    # bg_results: listed background plugins that left the queued state; parser_status: all parser rows in any status.
    +    bg_results = c.execute(
    +        "SELECT plugin, status FROM core_archiveresult WHERE plugin IN ('favicon', 'consolelog', 'ssl', 'responses', 'redirects', 'staticfile') AND status IN ('started', 'succeeded', 'failed')",
    +    ).fetchall()
    +    parser_status = c.execute(
    +        "SELECT plugin, status FROM core_archiveresult WHERE plugin LIKE 'parse_%_urls'",
    +    ).fetchall()
    +
    +    conn.close()
    +    # NOTE(review): nothing is asserted when bg_results is empty, and the check also passes when there are no parser rows at all.
    +    if len(bg_results) > 0:
    +        parser_statuses = [status for _, status in parser_status]
    +        non_queued = [s for s in parser_statuses if s != "queued"]
    +        assert len(non_queued) > 0 or len(parser_status) == 0, (
    +            f"With {len(bg_results)} background hooks started, parser extractors should still run. Got statuses: {parser_statuses}"
    +        )
    +
    +
    +if __name__ == "__main__":  # running this file directly invokes pytest on it in verbose mode
    +    pytest.main([__file__, "-v"])
    diff --git a/archivebox/tests/test_runner.py b/archivebox/tests/test_runner.py
    new file mode 100644
    index 0000000000..d832d07a83
    --- /dev/null
    +++ b/archivebox/tests/test_runner.py
    @@ -0,0 +1,886 @@
    +import asyncio
    +import json
    +import subprocess
    +import sys
    +from pathlib import Path
    +from types import SimpleNamespace
    +
    +import pytest
    +from django.test import RequestFactory
    +
    +
    +pytestmark = pytest.mark.django_db  # applies the django_db marker to every test in this module
    +
    +
    +class _DummyBus:  # in-memory stand-in returned by the patched create_bus in the tests below
    +    def __init__(self, name: str):
    +        self.name = name
    +        self.registrations = []  # SimpleNamespace(event_pattern, handler) entries added by on()
    +
    +    def on(self, event_pattern, handler):  # records the handler and returns the registration object
    +        registration = SimpleNamespace(event_pattern=event_pattern, handler=handler)
    +        self.registrations.append(registration)
    +        return registration
    +
    +    def off(self, event_pattern, registration):  # event_pattern is ignored; removal is by object identity
    +        self.registrations = [existing for existing in self.registrations if existing is not registration]
    +
    +    async def stop(self):  # no-op coroutine
    +        return None
    +
    +
    +class _DummyService:  # inert replacement patched over the runner's *Service classes
    +    def __init__(self, *args, **kwargs):  # accepts and discards any constructor arguments
    +        pass
    +
    +
    +def test_run_snapshot_reuses_crawl_bus_for_all_snapshots(monkeypatch):
    +    from archivebox.base_models.models import get_or_create_system_user_pk
    +    from archivebox.crawls.models import Crawl
    +    from archivebox.core.models import Snapshot
    +    from archivebox.services import runner as runner_module
    +    # Two queued snapshots that belong to the same crawl.
    +    crawl = Crawl.objects.create(
    +        urls="https://blog.sweeting.me\nhttps://sweeting.me",
    +        created_by_id=get_or_create_system_user_pk(),
    +    )
    +    snapshot_a = Snapshot.objects.create(
    +        url="https://blog.sweeting.me",
    +        crawl=crawl,
    +        status=Snapshot.StatusChoices.QUEUED,
    +    )
    +    snapshot_b = Snapshot.objects.create(
    +        url="https://sweeting.me",
    +        crawl=crawl,
    +        status=Snapshot.StatusChoices.QUEUED,
    +    )
    +    # Record every bus the runner creates so the total can be asserted at the end.
    +    created_buses: list[_DummyBus] = []
    +
    +    def fake_create_bus(*, name, total_timeout=3600.0, **kwargs):
    +        bus = _DummyBus(name)
    +        created_buses.append(bus)
    +        return bus
    +    # Replace bus creation, plugin discovery, every service class and the setup helpers with inert fakes.
    +    monkeypatch.setattr(runner_module, "create_bus", fake_create_bus)
    +    monkeypatch.setattr(runner_module, "discover_plugins", lambda: {})
    +    monkeypatch.setattr(runner_module, "ProcessService", _DummyService)
    +    monkeypatch.setattr(runner_module, "BinaryService", _DummyService)
    +    monkeypatch.setattr(runner_module, "TagService", _DummyService)
    +    monkeypatch.setattr(runner_module, "CrawlService", _DummyService)
    +    monkeypatch.setattr(runner_module, "SnapshotService", _DummyService)
    +    monkeypatch.setattr(runner_module, "ArchiveResultService", _DummyService)
    +    monkeypatch.setattr(runner_module, "_emit_machine_config", lambda *args, **kwargs: asyncio.sleep(0))
    +    monkeypatch.setattr(runner_module, "setup_abx_services", lambda *args, **kwargs: None)
    +    # Stand-in for download(): logs which bus and snapshot id each call received.
    +    download_calls = []
    +
    +    async def fake_download(*, url, bus, config_overrides, **kwargs):
    +        extra_context = json.loads(config_overrides["EXTRA_CONTEXT"])
    +        download_calls.append(
    +            {
    +                "url": url,
    +                "bus": bus,
    +                "snapshot_id": extra_context["snapshot_id"],
    +                "source_url": url,
    +            },
    +        )
    +        await asyncio.sleep(0)
    +        return []
    +
    +    monkeypatch.setattr(runner_module, "download", fake_download)
    +
    +    crawl_runner = runner_module.CrawlRunner(crawl)
    +    snapshot_data = {  # "config" values come from the real load_snapshot_payload, which is only patched afterwards
    +        str(snapshot_a.id): {
    +            "id": str(snapshot_a.id),
    +            "url": snapshot_a.url,
    +            "status": snapshot_a.status,
    +            "title": snapshot_a.title,
    +            "timestamp": snapshot_a.timestamp,
    +            "bookmarked_at": snapshot_a.bookmarked_at.isoformat() if snapshot_a.bookmarked_at else "",
    +            "created_at": snapshot_a.created_at.isoformat() if snapshot_a.created_at else "",
    +            "tags": snapshot_a.tags_str(),
    +            "depth": snapshot_a.depth,
    +            "output_dir": str(snapshot_a.output_dir),
    +            "config": crawl_runner.load_snapshot_payload(str(snapshot_a.id))["config"],
    +        },
    +        str(snapshot_b.id): {
    +            "id": str(snapshot_b.id),
    +            "url": snapshot_b.url,
    +            "status": snapshot_b.status,
    +            "title": snapshot_b.title,
    +            "timestamp": snapshot_b.timestamp,
    +            "bookmarked_at": snapshot_b.bookmarked_at.isoformat() if snapshot_b.bookmarked_at else "",
    +            "created_at": snapshot_b.created_at.isoformat() if snapshot_b.created_at else "",
    +            "tags": snapshot_b.tags_str(),
    +            "depth": snapshot_b.depth,
    +            "output_dir": str(snapshot_b.output_dir),
    +            "config": crawl_runner.load_snapshot_payload(str(snapshot_b.id))["config"],
    +        },
    +    }
    +    monkeypatch.setattr(crawl_runner, "load_snapshot_payload", lambda snapshot_id: snapshot_data[snapshot_id])
    +    # Run both snapshots concurrently on a single event loop.
    +    async def run_both():
    +        await asyncio.gather(
    +            crawl_runner.run_snapshot(str(snapshot_a.id)),
    +            crawl_runner.run_snapshot(str(snapshot_b.id)),
    +        )
    +
    +    asyncio.run(run_both())
    +    # Each snapshot was downloaded once, both calls got the same bus object, and only one bus was ever created.
    +    assert len(download_calls) == 2
    +    assert {call["snapshot_id"] for call in download_calls} == {str(snapshot_a.id), str(snapshot_b.id)}
    +    assert {call["source_url"] for call in download_calls} == {snapshot_a.url, snapshot_b.url}
    +    assert len({id(call["bus"]) for call in download_calls}) == 1
    +    assert len(created_buses) == 1
    +
    +
    +def test_ensure_background_runner_starts_when_none_running(monkeypatch):
    +    import archivebox.machine.models as machine_models
    +    from archivebox.services import runner as runner_module
    +    # Capture Popen invocations instead of spawning a real process.
    +    popen_calls = []
    +
    +    class DummyPopen:
    +        def __init__(self, args, **kwargs):
    +            popen_calls.append((args, kwargs))
    +    # Stub the cleanup helpers and make Process.objects.filter(...).exists() report that nothing is running.
    +    monkeypatch.setattr(machine_models.Process, "cleanup_stale_running", classmethod(lambda cls, machine=None: 0))
    +    monkeypatch.setattr(machine_models.Process, "cleanup_orphaned_workers", classmethod(lambda cls: 0))
    +    monkeypatch.setattr(machine_models.Machine, "current", classmethod(lambda cls: SimpleNamespace(id="machine-1")))
    +    monkeypatch.setattr(
    +        machine_models.Process.objects,
    +        "filter",
    +        lambda **kwargs: SimpleNamespace(exists=lambda: False),
    +    )
    +    monkeypatch.setattr(runner_module.subprocess, "Popen", DummyPopen)
    +
    +    started = runner_module.ensure_background_runner(allow_under_pytest=True)
    +    # Exactly one `archivebox run --daemon` launch via the current interpreter, with stdin set to DEVNULL.
    +    assert started is True
    +    assert len(popen_calls) == 1
    +    assert popen_calls[0][0] == [runner_module.sys.executable, "-m", "archivebox", "run", "--daemon"]
    +    assert popen_calls[0][1]["stdin"] is subprocess.DEVNULL
    +
    +
    +def test_ensure_background_runner_skips_when_orchestrator_running(monkeypatch):
    +    import archivebox.machine.models as machine_models
    +    from archivebox.services import runner as runner_module
    +    # Stub the cleanup helpers and make Process.objects.filter(...).exists() report an existing process.
    +    monkeypatch.setattr(machine_models.Process, "cleanup_stale_running", classmethod(lambda cls, machine=None: 0))
    +    monkeypatch.setattr(machine_models.Process, "cleanup_orphaned_workers", classmethod(lambda cls: 0))
    +    monkeypatch.setattr(machine_models.Machine, "current", classmethod(lambda cls: SimpleNamespace(id="machine-1")))
    +    monkeypatch.setattr(
    +        machine_models.Process.objects,
    +        "filter",
    +        lambda **kwargs: SimpleNamespace(exists=lambda: True),
    +    )
    +    monkeypatch.setattr(  # any Popen call raises AssertionError and fails the test
    +        runner_module.subprocess,
    +        "Popen",
    +        lambda *args, **kwargs: (_ for _ in ()).throw(AssertionError("runner should not be spawned")),
    +    )
    +
    +    started = runner_module.ensure_background_runner(allow_under_pytest=True)
    +
    +    assert started is False
    +
    +
    +def test_runner_prepare_refreshes_network_interface_and_attaches_current_process(monkeypatch):
    +    from archivebox.base_models.models import get_or_create_system_user_pk
    +    from archivebox.crawls.models import Crawl
    +    from archivebox.services import runner as runner_module
    +    # load_run_state() should refresh the network interface and attach it, with its machine, to the current Process.
    +    crawl = Crawl.objects.create(
    +        urls="https://example.com",
    +        created_by_id=get_or_create_system_user_pk(),
    +    )
    +
    +    class _Iface:
    +        id = "iface-1"
    +        machine = SimpleNamespace(id="machine-1")
    +        machine_id = "machine-1"
    +
    +    saved_updates = []  # update_fields tuple from each proc.save() call
    +
    +    class _Proc:
    +        iface_id = None
    +        machine_id = "machine-1"
    +        iface = None
    +        machine = None
    +
    +        def save(self, *, update_fields):
    +            saved_updates.append(tuple(update_fields))
    +
    +    proc = _Proc()
    +
    +    monkeypatch.setattr(runner_module, "discover_plugins", lambda: {})
    +    monkeypatch.setattr(runner_module, "create_bus", lambda **kwargs: _DummyBus(kwargs["name"]))
    +    monkeypatch.setattr(runner_module, "ProcessService", _DummyService)
    +    monkeypatch.setattr(runner_module, "BinaryService", _DummyService)
    +    monkeypatch.setattr(runner_module, "TagService", _DummyService)
    +    monkeypatch.setattr(runner_module, "CrawlService", _DummyService)
    +    monkeypatch.setattr(runner_module, "SnapshotService", _DummyService)
    +    monkeypatch.setattr(runner_module, "ArchiveResultService", _DummyService)
    +
    +    from archivebox.machine.models import NetworkInterface, Process
    +    from archivebox.config import configset as configset_module
    +    # list.append returns None, so the lambda below records the refresh flag and then returns a fresh _Iface.
    +    refresh_calls = []
    +    monkeypatch.setattr(NetworkInterface, "current", classmethod(lambda cls, refresh=False: refresh_calls.append(refresh) or _Iface()))
    +    monkeypatch.setattr(Process, "current", classmethod(lambda cls: proc))
    +    monkeypatch.setattr(configset_module, "get_config", lambda **kwargs: {"PLUGINS": "", "CHROME_BINARY": "", "TIMEOUT": 60})
    +
    +    crawl_runner = runner_module.CrawlRunner(crawl)
    +    crawl_runner.load_run_state()
    +    # current() was called once with refresh=True, and a single save() listed iface, machine and modified_at.
    +    assert refresh_calls == [True]
    +    assert proc.iface is not None
    +    assert proc.machine == proc.iface.machine
    +    assert saved_updates == [("iface", "machine", "modified_at")]
    +
    +
    +def test_load_run_state_uses_machine_config_as_derived_config(monkeypatch):
    +    from archivebox.machine.models import Machine, NetworkInterface, Process
    +    from archivebox.services import runner as runner_module
    +    from archivebox.config import configset as configset_module
    +    from archivebox.base_models.models import get_or_create_system_user_pk
    +    from archivebox.crawls.models import Crawl
    +    # Machine row whose config dict is expected to become CrawlRunner.derived_config.
    +    machine = Machine.objects.create(
    +        guid="test-guid-runner-overrides",
    +        hostname="runner-host",
    +        hw_in_docker=False,
    +        hw_in_vm=False,
    +        hw_manufacturer="Test",
    +        hw_product="Test Product",
    +        hw_uuid="test-hw-runner-overrides",
    +        os_arch="arm64",
    +        os_family="darwin",
    +        os_platform="macOS",
    +        os_release="14.0",
    +        os_kernel="Darwin",
    +        stats={},
    +        config={"WGET_BINARY": "/tmp/wget", "ABX_INSTALL_CACHE": {"wget": "2026-03-24T00:00:00+00:00"}},
    +    )
    +    crawl = Crawl.objects.create(
    +        urls="https://example.com",
    +        created_by_id=get_or_create_system_user_pk(),
    +    )
    +    proc = SimpleNamespace(iface_id=str(machine.id), machine_id=str(machine.id), iface=None, machine=machine, save=lambda **kwargs: None)
    +    # Point NetworkInterface/Process/Machine.current at the objects above and stub config loading.
    +    monkeypatch.setattr(
    +        NetworkInterface,
    +        "current",
    +        classmethod(lambda cls, refresh=False: SimpleNamespace(id=machine.id, machine=machine)),
    +    )
    +    monkeypatch.setattr(Process, "current", classmethod(lambda cls: proc))
    +    monkeypatch.setattr(Machine, "current", classmethod(lambda cls: machine))
    +    monkeypatch.setattr(configset_module, "get_config", lambda **kwargs: {"PLUGINS": "", "CHROME_BINARY": "", "TIMEOUT": 60})
    +
    +    crawl_runner = runner_module.CrawlRunner(crawl)
    +    crawl_runner.load_run_state()
    +
    +    assert crawl_runner.derived_config == machine.config
    +
    +
    +def test_load_run_state_uses_enabled_plugins_when_plugins_key_missing(monkeypatch):
    +    from archivebox.machine.models import Machine, NetworkInterface, Process
    +    from archivebox.services import runner as runner_module
    +    from archivebox.config import configset as configset_module
    +    from archivebox import hooks as hooks_module
    +    from archivebox.base_models.models import get_or_create_system_user_pk
    +    from archivebox.crawls.models import Crawl
    +    from pathlib import Path  # redundant: Path is already imported at module level
    +    # Machine with an empty config; the stubbed get_config() below also omits the PLUGINS key.
    +    machine = Machine.objects.create(
    +        guid="test-guid-runner-missing-plugins",
    +        hostname="runner-host-missing-plugins",
    +        hw_in_docker=False,
    +        hw_in_vm=False,
    +        hw_manufacturer="Test",
    +        hw_product="Test Product",
    +        hw_uuid="test-hw-runner-missing-plugins",
    +        os_arch="arm64",
    +        os_family="darwin",
    +        os_platform="macOS",
    +        os_release="14.0",
    +        os_kernel="Darwin",
    +        stats={},
    +        config={},
    +    )
    +    crawl = Crawl.objects.create(
    +        urls="https://example.com",
    +        created_by_id=get_or_create_system_user_pk(),
    +    )
    +    proc = SimpleNamespace(iface_id=str(machine.id), machine_id=str(machine.id), iface=None, machine=machine, save=lambda **kwargs: None)
    +
    +    monkeypatch.setattr(
    +        NetworkInterface,
    +        "current",
    +        classmethod(lambda cls, refresh=False: SimpleNamespace(id=machine.id, machine=machine)),
    +    )
    +    monkeypatch.setattr(Process, "current", classmethod(lambda cls: proc))
    +    monkeypatch.setattr(Machine, "current", classmethod(lambda cls: machine))
    +    monkeypatch.setattr(configset_module, "get_config", lambda **kwargs: {"CHROME_BINARY": "", "TIMEOUT": 60})
    +    monkeypatch.setattr(  # fake hook discovery: wget and favicon hook paths for CrawlSetup/Snapshot, nothing otherwise
    +        hooks_module,
    +        "discover_hooks",
    +        lambda event_name, config=None: (
    +            [
    +                Path(f"/tmp/{event_name.lower()}/wget/on_{event_name}__test.py"),
    +                Path(f"/tmp/{event_name.lower()}/favicon/on_{event_name}__test.py"),
    +            ]
    +            if event_name in {"CrawlSetup", "Snapshot"}
    +            else []
    +        ),
    +    )
    +
    +    crawl_runner = runner_module.CrawlRunner(crawl)
    +    snapshot_ids = crawl_runner.load_run_state()
    +    # The expected plugin names match the directory names used in the fake hook paths above.
    +    assert crawl_runner.selected_plugins == ["favicon", "wget"]
    +    assert len(snapshot_ids) == 1
    +
    +
    +def test_run_snapshot_skips_descendant_when_max_size_already_reached(monkeypatch, tmp_path):
    +    from archivebox.base_models.models import get_or_create_system_user_pk
    +    from archivebox.crawls.models import Crawl
    +    from archivebox.services import runner as runner_module
    +    # Crawl capped at max_size=16; the limits file written below already records total_size=32.
    +    crawl = Crawl.objects.create(
    +        urls="https://example.com",
    +        created_by_id=get_or_create_system_user_pk(),
    +        max_size=16,
    +    )
    +
    +    monkeypatch.setattr(runner_module, "discover_plugins", lambda: {})
    +    monkeypatch.setattr(runner_module, "create_bus", lambda **kwargs: _DummyBus(kwargs["name"]))
    +    monkeypatch.setattr(runner_module, "ProcessService", _DummyService)
    +    monkeypatch.setattr(runner_module, "BinaryService", _DummyService)
    +    monkeypatch.setattr(runner_module, "TagService", _DummyService)
    +    monkeypatch.setattr(runner_module, "CrawlService", _DummyService)
    +    monkeypatch.setattr(runner_module, "SnapshotService", _DummyService)
    +    monkeypatch.setattr(runner_module, "ArchiveResultService", _DummyService)
    +    monkeypatch.setattr(  # any download() call raises AssertionError and fails the test
    +        runner_module,
    +        "download",
    +        lambda *args, **kwargs: (_ for _ in ()).throw(AssertionError("snapshot download should have been skipped")),
    +    )
    +
    +    crawl_runner = runner_module.CrawlRunner(crawl)
    +    state_dir = tmp_path / ".abx-dl"  # sits under the CRAWL_DIR given in the payload below
    +    state_dir.mkdir(parents=True, exist_ok=True)
    +    (state_dir / "limits.json").write_text(
    +        json.dumps(
    +            {
    +                "admitted_snapshot_ids": ["child-1"],
    +                "counted_process_ids": ["proc-1"],
    +                "total_size": 32,
    +                "stop_reason": "max_size",
    +            },
    +        ),
    +        encoding="utf-8",
    +    )
    +    cancelled: list[str] = []  # snapshot ids passed to seal_snapshot_due_to_limit
    +    crawl_runner.load_snapshot_payload = lambda snapshot_id: {
    +        "id": snapshot_id,
    +        "url": "https://example.com/child",
    +        "title": "",
    +        "timestamp": "",
    +        "bookmarked_at": "",
    +        "created_at": "",
    +        "tags": "",
    +        "depth": 1,
    +        "status": "queued",
    +        "output_dir": "/tmp/child",
    +        "config": {"CRAWL_DIR": str(tmp_path), "MAX_SIZE": 16},
    +    }
    +    crawl_runner.seal_snapshot_due_to_limit = lambda snapshot_id: cancelled.append(snapshot_id)
    +
    +    asyncio.run(crawl_runner.run_snapshot("child-1"))
    +    # The runner must hand the snapshot to seal_snapshot_due_to_limit rather than download it.
    +    assert cancelled == ["child-1"]
    +
    +
    +@pytest.mark.django_db(transaction=True)
    +def test_seal_snapshot_cancels_queued_descendants_after_max_size():
    +    from archivebox.base_models.models import get_or_create_system_user_pk
    +    from archivebox.crawls.models import Crawl
    +    from archivebox.core.models import Snapshot
    +    from archivebox.services.snapshot_service import SnapshotService
    +    from abx_dl.events import SnapshotCompletedEvent
    +    from abx_dl.orchestrator import create_bus
    +    # A started root plus a queued depth-1 child, in a crawl capped at max_size=16.
    +    crawl = Crawl.objects.create(
    +        urls="https://example.com",
    +        created_by_id=get_or_create_system_user_pk(),
    +        max_size=16,
    +    )
    +    root = Snapshot.objects.create(
    +        url="https://example.com",
    +        crawl=crawl,
    +        status=Snapshot.StatusChoices.STARTED,
    +    )
    +    child = Snapshot.objects.create(
    +        url="https://example.com/child",
    +        crawl=crawl,
    +        depth=1,
    +        parent_snapshot_id=root.id,
    +        status=Snapshot.StatusChoices.QUEUED,
    +    )
    +    # Write limit state under the crawl's output dir recording total_size=32 and stop_reason="max_size".
    +    state_dir = Path(crawl.output_dir) / ".abx-dl"
    +    state_dir.mkdir(parents=True, exist_ok=True)
    +    (state_dir / "limits.json").write_text(
    +        json.dumps(
    +            {
    +                "admitted_snapshot_ids": [str(root.id), str(child.id)],
    +                "counted_process_ids": ["proc-1"],
    +                "total_size": 32,
    +                "stop_reason": "max_size",
    +            },
    +        ),
    +        encoding="utf-8",
    +    )
    +    # Call the service's SnapshotCompletedEvent handler directly for the root snapshot.
    +    bus = create_bus(name="test_snapshot_limit_cancel")
    +    service = SnapshotService(bus, crawl_id=str(crawl.id), schedule_snapshot=lambda snapshot_id: None)
    +    try:
    +
    +        async def emit_event() -> None:
    +            await service.on_SnapshotCompletedEvent(
    +                SnapshotCompletedEvent(
    +                    url=root.url,
    +                    snapshot_id=str(root.id),
    +                    output_dir=str(root.output_dir),
    +                ),
    +            )
    +
    +        asyncio.run(emit_event())
    +    finally:
    +        asyncio.run(bus.stop())
    +    # Both the root and the still-queued child must end up sealed, and the child's retry_at must be None.
    +    root.refresh_from_db()
    +    child.refresh_from_db()
    +    assert root.status == Snapshot.StatusChoices.SEALED
    +    assert child.status == Snapshot.StatusChoices.SEALED
    +    assert child.retry_at is None
    +
    +
    +def test_create_crawl_api_queues_crawl_without_spawning_runner(monkeypatch):
    +    from django.contrib.auth import get_user_model
    +    from archivebox.api.v1_crawls import CrawlCreateSchema, create_crawl
    +    # NOTE(review): the monkeypatch fixture is requested but unused; nothing here asserts that no runner was spawned.
    +    user = get_user_model().objects.create_superuser(
    +        username="runner-api-admin",
    +        email="runner-api-admin@example.com",
    +        password="testpassword",
    +    )
    +    request = RequestFactory().post("/api/v1/crawls")
    +    request.user = user
    +    # Call the view function directly (no HTTP round trip) with a single-URL, depth-0 payload.
    +    crawl = create_crawl(
    +        request,
    +        CrawlCreateSchema(
    +            urls=["https://example.com"],
    +            max_depth=0,
    +            tags=[],
    +            tags_str="",
    +            label="",
    +            notes="",
    +            config={},
    +        ),
    +    )
    +    # The returned crawl must have an id, be in the "queued" state, and carry a retry_at.
    +    assert str(crawl.id)
    +    assert crawl.status == "queued"
    +    assert crawl.retry_at is not None
    +
    +
    +def test_crawl_runner_does_not_seal_unfinished_crawl(monkeypatch):
    +    from archivebox.base_models.models import get_or_create_system_user_pk
    +    from archivebox.crawls.models import Crawl
    +    from archivebox.core.models import Snapshot
    +    from archivebox.services import runner as runner_module
    +    # Scenario: a STARTED crawl whose only snapshot is also still STARTED.
    +    crawl = Crawl.objects.create(
    +        urls="https://example.com",
    +        created_by_id=get_or_create_system_user_pk(),
    +        status=Crawl.StatusChoices.STARTED,
    +    )
    +    snapshot = Snapshot.objects.create(
    +        url="https://example.com",
    +        crawl=crawl,
    +        status=Snapshot.StatusChoices.STARTED,
    +    )
    +    # Stub the runner phases; stubs returning asyncio.sleep(0) hand back an awaitable that finishes after one loop yield.
    +    monkeypatch.setattr(runner_module, "_emit_machine_config", lambda *args, **kwargs: asyncio.sleep(0))
    +    monkeypatch.setattr(runner_module, "setup_abx_services", lambda *args, **kwargs: None)
    +    monkeypatch.setattr(runner_module.CrawlRunner, "load_run_state", lambda self: [str(snapshot.id)])
    +    monkeypatch.setattr(
    +        runner_module.CrawlRunner,
    +        "load_snapshot_payload",
    +        lambda self, _snapshot_id: {
    +            "id": str(snapshot.id),
    +            "url": snapshot.url,
    +            "depth": snapshot.depth,
    +            "output_dir": str(snapshot.output_dir),
    +        },
    +    )
    +    monkeypatch.setattr(runner_module.CrawlRunner, "_create_live_ui", lambda self: None)
    +    monkeypatch.setattr(runner_module.CrawlRunner, "run_crawl_setup", lambda self, snapshot_id: asyncio.sleep(0))
    +    monkeypatch.setattr(runner_module.CrawlRunner, "enqueue_snapshot", lambda self, snapshot_id: asyncio.sleep(0))
    +    monkeypatch.setattr(runner_module.CrawlRunner, "wait_for_snapshot_tasks", lambda self: asyncio.sleep(0))
    +    monkeypatch.setattr(runner_module.CrawlRunner, "run_crawl_cleanup", lambda self, snapshot_id: asyncio.sleep(0))
    +    monkeypatch.setattr(runner_module.CrawlRunner, "finalize_run_state", lambda self: None)
    +    # Drive one full run() restricted to this snapshot.
    +    asyncio.run(runner_module.CrawlRunner(crawl, snapshot_ids=[str(snapshot.id)]).run())
    +    # With the snapshot unfinished, the crawl must not be SEALED and must still have a retry_at.
    +    crawl.refresh_from_db()
    +    assert crawl.status != Crawl.StatusChoices.SEALED
    +    assert crawl.retry_at is not None
    +
    +
    +def test_crawl_runner_calls_load_and_finalize_run_state(monkeypatch):
    +    from archivebox.base_models.models import get_or_create_system_user_pk
    +    from archivebox.crawls.models import Crawl
    +    from archivebox.core.models import Snapshot
    +    from archivebox.services import runner as runner_module
    +    # Scenario: a STARTED crawl with a single STARTED snapshot.
    +    crawl = Crawl.objects.create(
    +        urls="https://example.com",
    +        created_by_id=get_or_create_system_user_pk(),
    +        status=Crawl.StatusChoices.STARTED,
    +    )
    +    snapshot = Snapshot.objects.create(
    +        url="https://example.com",
    +        crawl=crawl,
    +        status=Snapshot.StatusChoices.STARTED,
    +    )
    +    # Swap the bus, plugin discovery and service classes for dummies (_DummyBus/_DummyService are defined outside this test).
    +    monkeypatch.setattr(runner_module, "create_bus", lambda *args, **kwargs: _DummyBus("runner"))
    +    monkeypatch.setattr(runner_module, "discover_plugins", lambda: {})
    +    monkeypatch.setattr(runner_module, "ProcessService", _DummyService)
    +    monkeypatch.setattr(runner_module, "BinaryService", _DummyService)
    +    monkeypatch.setattr(runner_module, "TagService", _DummyService)
    +    monkeypatch.setattr(runner_module, "CrawlService", _DummyService)
    +    monkeypatch.setattr(runner_module, "SnapshotService", _DummyService)
    +    monkeypatch.setattr(runner_module, "ArchiveResultService", _DummyService)
    +    monkeypatch.setattr(runner_module, "_emit_machine_config", lambda *args, **kwargs: asyncio.sleep(0))
    +    monkeypatch.setattr(runner_module, "setup_abx_services", lambda *args, **kwargs: None)
    +    monkeypatch.setattr(runner_module.CrawlRunner, "load_run_state", lambda self: [str(snapshot.id)])  # NOTE(review): replaced again by wrapped_load below
    +    monkeypatch.setattr(
    +        runner_module.CrawlRunner,
    +        "load_snapshot_payload",
    +        lambda self, _snapshot_id: {
    +            "id": str(snapshot.id),
    +            "url": snapshot.url,
    +            "depth": snapshot.depth,
    +            "output_dir": str(snapshot.output_dir),
    +        },
    +    )
    +    monkeypatch.setattr(runner_module.CrawlRunner, "_create_live_ui", lambda self: None)
    +    monkeypatch.setattr(runner_module.CrawlRunner, "run_crawl_setup", lambda self, snapshot_id: asyncio.sleep(0))
    +    monkeypatch.setattr(runner_module.CrawlRunner, "enqueue_snapshot", lambda self, snapshot_id: asyncio.sleep(0))
    +    monkeypatch.setattr(runner_module.CrawlRunner, "wait_for_snapshot_tasks", lambda self: asyncio.sleep(0))
    +    monkeypatch.setattr(runner_module.CrawlRunner, "run_crawl_cleanup", lambda self, snapshot_id: asyncio.sleep(0))
    +    monkeypatch.setenv("DJANGO_ALLOW_ASYNC_UNSAFE", "true")  # Django opt-out of its async-safety check for sync ORM calls
    +    # Record the order in which the two run-state hooks are invoked.
    +    method_calls: list[str] = []
    +
    +    def wrapped_finalize(self):
    +        method_calls.append("finalize_run_state")
    +        return None
    +
    +    def wrapped_load(self):
    +        method_calls.append("load_run_state")
    +        return [str(snapshot.id)]
    +
    +    monkeypatch.setattr(runner_module.CrawlRunner, "finalize_run_state", wrapped_finalize)
    +    monkeypatch.setattr(runner_module.CrawlRunner, "load_run_state", wrapped_load)
    +
    +    asyncio.run(runner_module.CrawlRunner(crawl, snapshot_ids=[str(snapshot.id)]).run())
    +    # Crawl stays STARTED with a retry_at, and load_run_state must have been called once, before finalize_run_state.
    +    crawl.refresh_from_db()
    +    assert crawl.status == Crawl.StatusChoices.STARTED
    +    assert crawl.retry_at is not None
    +    assert method_calls == ["load_run_state", "finalize_run_state"]
    +
    +
    +def test_wait_for_snapshot_tasks_surfaces_already_failed_task():
    +    from archivebox.base_models.models import get_or_create_system_user_pk
    +    from archivebox.crawls.models import Crawl
    +    from archivebox.services import runner as runner_module
    +    # Scenario: the runner is already tracking a snapshot future that failed before waiting begins.
    +    crawl = Crawl.objects.create(
    +        urls="https://example.com",
    +        created_by_id=get_or_create_system_user_pk(),
    +    )
    +    crawl_runner = runner_module.CrawlRunner(crawl)
    +
    +    async def run_test():
    +        task = asyncio.get_running_loop().create_future()  # a bare Future stands in for a snapshot task
    +        task.set_exception(RuntimeError("snapshot failed"))  # already failed before wait_for_snapshot_tasks runs
    +        crawl_runner.snapshot_tasks["snap-1"] = task
    +        with pytest.raises(RuntimeError, match="snapshot failed"):  # the stored exception must propagate to the waiter
    +            await crawl_runner.wait_for_snapshot_tasks()
    +
    +    asyncio.run(run_test())
    +
    +
    +def test_wait_for_snapshot_tasks_returns_after_completed_tasks_are_pruned():
    +    from archivebox.base_models.models import get_or_create_system_user_pk
    +    from archivebox.crawls.models import Crawl
    +    from archivebox.services import runner as runner_module
    +    # Scenario: the only tracked snapshot task completes after a single event-loop yield.
    +    crawl = Crawl.objects.create(
    +        urls="https://example.com",
    +        created_by_id=get_or_create_system_user_pk(),
    +    )
    +    crawl_runner = runner_module.CrawlRunner(crawl)
    +
    +    async def finish_snapshot() -> None:
    +        await asyncio.sleep(0)  # yield once, then finish successfully
    +
    +    async def run_test():
    +        task = asyncio.create_task(finish_snapshot())
    +        crawl_runner.snapshot_tasks["snap-1"] = task
    +        await asyncio.wait_for(crawl_runner.wait_for_snapshot_tasks(), timeout=0.5)  # a hang becomes a timeout error
    +        assert crawl_runner.snapshot_tasks == {}  # the finished task must no longer be tracked
    +
    +    asyncio.run(run_test())
    +
    +
    +def test_crawl_runner_calls_crawl_cleanup_after_snapshot_phase(monkeypatch):
    +    from archivebox.base_models.models import get_or_create_system_user_pk
    +    from archivebox.crawls.models import Crawl
    +    from archivebox.core.models import Snapshot
    +    from archivebox.services import runner as runner_module
    +    # Scenario: a STARTED crawl with a single STARTED snapshot.
    +    crawl = Crawl.objects.create(
    +        urls="https://example.com",
    +        created_by_id=get_or_create_system_user_pk(),
    +        status=Crawl.StatusChoices.STARTED,
    +    )
    +    snapshot = Snapshot.objects.create(
    +        url="https://example.com",
    +        crawl=crawl,
    +        status=Snapshot.StatusChoices.STARTED,
    +    )
    +    # Stub every runner phase except run_crawl_cleanup, which is replaced by the recording stub further down.
    +    monkeypatch.setattr(runner_module, "_emit_machine_config", lambda *args, **kwargs: asyncio.sleep(0))
    +    monkeypatch.setattr(runner_module, "setup_abx_services", lambda *args, **kwargs: None)
    +    monkeypatch.setattr(runner_module.CrawlRunner, "load_run_state", lambda self: [str(snapshot.id)])
    +    monkeypatch.setattr(
    +        runner_module.CrawlRunner,
    +        "load_snapshot_payload",
    +        lambda self, _snapshot_id: {
    +            "id": str(snapshot.id),
    +            "url": snapshot.url,
    +            "depth": snapshot.depth,
    +            "output_dir": str(snapshot.output_dir),
    +        },
    +    )
    +    monkeypatch.setattr(runner_module.CrawlRunner, "_create_live_ui", lambda self: None)
    +    monkeypatch.setattr(runner_module.CrawlRunner, "run_crawl_setup", lambda self, snapshot_id: asyncio.sleep(0))
    +    monkeypatch.setattr(runner_module.CrawlRunner, "enqueue_snapshot", lambda self, snapshot_id: asyncio.sleep(0))
    +    monkeypatch.setattr(runner_module.CrawlRunner, "wait_for_snapshot_tasks", lambda self: asyncio.sleep(0))
    +    monkeypatch.setattr(runner_module.CrawlRunner, "finalize_run_state", lambda self: None)
    +    # Recording stub: list.append returns None, so `or` falls through and the lambda returns the asyncio.sleep(0) awaitable.
    +    cleanup_calls = []
    +    monkeypatch.setattr(
    +        runner_module.CrawlRunner,
    +        "run_crawl_cleanup",
    +        lambda self, snapshot_id: cleanup_calls.append("abx_cleanup") or asyncio.sleep(0),
    +    )
    +    asyncio.run(runner_module.CrawlRunner(crawl, snapshot_ids=[str(snapshot.id)]).run())
    +    # run_crawl_cleanup must have been invoked exactly once during run().
    +    assert cleanup_calls == ["abx_cleanup"]
    +
    +
    +def test_abx_process_service_background_process_finishes_after_process_exit(monkeypatch, tmp_path):
    +    from abx_dl.events import ProcessCompletedEvent, ProcessEvent
    +    from abx_dl.services.process_service import ProcessService
    +    # Build a ProcessService without running __init__ and set a handful of attributes on it by hand.
    +    service = object.__new__(ProcessService)
    +    service.emit_jsonl = False
    +    service.interactive_tty = False
    +    service.pause_requested = asyncio.Event()
    +    service.abort_requested = False
    +    emitted_events = []
    +    # Minimal bus stand-in that records every emitted event and returns it unchanged.
    +    class FakeBus:
    +        async def emit(self, event):
    +            emitted_events.append(event)
    +            return event
    +
    +    service.bus = FakeBus()
    +    # Stdout-reader stub: waits on an Event that is never set, and returns one output line once it is cancelled.
    +    async def fake_stream_stdout(**kwargs):
    +        try:
    +            await asyncio.Event().wait()
    +        except asyncio.CancelledError:
    +            return ["daemon output\n"]
    +
    +    monkeypatch.setattr(service, "_stream_stdout", fake_stream_stdout)
    +    # Pre-create the hook's (empty) stderr log and a pid file inside a fresh plugin output dir.
    +    plugin_output_dir = tmp_path / "chrome"
    +    plugin_output_dir.mkdir()
    +    # NOTE: only the stderr log and the pid file are pre-created; no stdout log file is written by this test.
    +    stderr_file = plugin_output_dir / "on_CrawlSetup__90_chrome_launch.daemon.bg.stderr.log"
    +    stderr_file.write_text("")
    +    pid_file = plugin_output_dir / "on_CrawlSetup__90_chrome_launch.daemon.bg.pid"
    +    pid_file.write_text("12345")
    +
    +    async def run_test():
    +        event = ProcessEvent(
    +            plugin_name="chrome",
    +            hook_name="on_CrawlSetup__90_chrome_launch.daemon.bg",
    +            hook_path=sys.executable,  # hook command is the current interpreter ...
    +            hook_args=["-c", "pass"],  # ... invoked as `-c pass`, i.e. a no-op program
    +            env={},
    +            output_dir=str(plugin_output_dir),
    +            timeout=60,
    +            is_background=True,
    +            url="https://example.org/",
    +            process_type="hook",
    +            worker_type="hook",
    +        )
    +        await asyncio.wait_for(
    +            service.on_ProcessEvent(event),
    +            timeout=0.5,  # fail quickly instead of hanging if the handler never returns
    +        )
    +
    +    asyncio.run(run_test())
    +    # The pid file written above must be gone, and a ProcessCompletedEvent must have been emitted on the bus.
    +    assert pid_file.exists() is False
    +    assert any(isinstance(event, ProcessCompletedEvent) for event in emitted_events)
    +
    +
    +def test_run_pending_crawls_runs_due_snapshot_in_place(monkeypatch):
    +    from archivebox.base_models.models import get_or_create_system_user_pk
    +    from archivebox.crawls.models import Crawl
    +    from archivebox.core.models import Snapshot
    +    from archivebox.services import runner as runner_module
    +    # Scenario: a QUEUED snapshot with retry_at=now whose parent crawl is already SEALED.
    +    crawl = Crawl.objects.create(
    +        urls="https://example.com",
    +        created_by_id=get_or_create_system_user_pk(),
    +        status=Crawl.StatusChoices.SEALED,
    +    )
    +    snapshot = Snapshot.objects.create(
    +        url="https://example.com",
    +        crawl=crawl,
    +        status=Snapshot.StatusChoices.QUEUED,
    +        retry_at=runner_module.timezone.now(),
    +    )
    +    # Patch the model classes so claim_processing_lock always reports success.
    +    monkeypatch.setattr(type(snapshot), "claim_processing_lock", lambda self, lock_seconds=60: True)
    +    monkeypatch.setattr(type(crawl), "claim_processing_lock", lambda self, lock_seconds=60: True)
    +    # One (crawl_id, snapshot_ids, process_discovered_snapshots_inline) tuple is recorded per run_crawl call.
    +    run_calls: list[tuple[str, list[str] | None, bool]] = []
    +
    +    def fake_run_crawl(crawl_id, snapshot_ids=None, selected_plugins=None, process_discovered_snapshots_inline=True):
    +        run_calls.append((crawl_id, snapshot_ids, process_discovered_snapshots_inline))
    +        snapshot.status = Snapshot.StatusChoices.SEALED  # seal the snapshot as a stand-in for a finished run
    +        snapshot.retry_at = None  # presumably so the scheduler finds nothing further due — verify
    +        snapshot.save(update_fields=["status", "retry_at", "modified_at"])
    +
    +    monkeypatch.setattr(runner_module, "run_crawl", fake_run_crawl)
    +
    +    result = runner_module.run_pending_crawls(daemon=False)
    +    # Expect a 0 return and exactly one run_crawl call scoped to that snapshot, with inline discovery disabled.
    +    assert result == 0
    +    assert run_calls == [(str(crawl.id), [str(snapshot.id)], False)]
    +
    +
    +def test_run_pending_crawls_prioritizes_new_queued_crawl_before_snapshot_backlog(monkeypatch):
    +    from archivebox.base_models.models import get_or_create_system_user_pk
    +    from archivebox.crawls.models import Crawl
    +    from archivebox.core.models import Snapshot
    +    from archivebox.services import runner as runner_module
    +    # Scenario: an older STARTED crawl with a due QUEUED snapshot, plus a newer QUEUED crawl that is also due.
    +    older_crawl = Crawl.objects.create(
    +        urls="https://older.example.com",
    +        created_by_id=get_or_create_system_user_pk(),
    +        status=Crawl.StatusChoices.STARTED,
    +    )
    +    older_snapshot = Snapshot.objects.create(
    +        url="https://older.example.com",
    +        crawl=older_crawl,
    +        status=Snapshot.StatusChoices.QUEUED,
    +        retry_at=runner_module.timezone.now(),
    +    )
    +    newer_crawl = Crawl.objects.create(
    +        urls="https://newer.example.com",
    +        created_by_id=get_or_create_system_user_pk(),
    +        status=Crawl.StatusChoices.QUEUED,
    +        retry_at=runner_module.timezone.now(),
    +    )
    +    # Patch the model classes so claim_processing_lock always reports success.
    +    monkeypatch.setattr(type(older_snapshot), "claim_processing_lock", lambda self, lock_seconds=60: True)
    +    monkeypatch.setattr(type(older_crawl), "claim_processing_lock", lambda self, lock_seconds=60: True)
    +    monkeypatch.setattr(type(newer_crawl), "claim_processing_lock", lambda self, lock_seconds=60: True)
    +    # One (crawl_id, snapshot_ids, process_discovered_snapshots_inline) tuple is recorded per run_crawl call.
    +    run_calls: list[tuple[str, list[str] | None, bool]] = []
    +    # Raised by the fake below to abort run_pending_crawls right after its first run_crawl call.
    +    class _StopScheduling(Exception):
    +        pass
    +
    +    def fake_run_crawl(crawl_id, snapshot_ids=None, selected_plugins=None, process_discovered_snapshots_inline=True):
    +        run_calls.append((crawl_id, snapshot_ids, process_discovered_snapshots_inline))
    +        raise _StopScheduling
    +
    +    monkeypatch.setattr(runner_module, "run_crawl", fake_run_crawl)
    +
    +    with pytest.raises(_StopScheduling):
    +        runner_module.run_pending_crawls(daemon=False)
    +    # The first (and only) call must target the newer crawl as a whole: snapshot_ids is None.
    +    assert run_calls == [(str(newer_crawl.id), None, False)]
    +
    +
    +def test_run_pending_crawls_prioritizes_queued_crawl_before_unrelated_binary_backlog(monkeypatch):
    +    from archivebox.base_models.models import get_or_create_system_user_pk
    +    from archivebox.crawls.models import Crawl
    +    from archivebox.machine.models import Binary, Machine
    +    from archivebox.services import runner as runner_module
    +    # Scenario: one due QUEUED crawl and one due QUEUED Binary ("papers-dl") on the current machine.
    +    queued_crawl = Crawl.objects.create(
    +        urls="https://scheduled.example.com",
    +        created_by_id=get_or_create_system_user_pk(),
    +        status=Crawl.StatusChoices.QUEUED,
    +        retry_at=runner_module.timezone.now(),
    +    )
    +    unrelated_binary = Binary.objects.create(
    +        machine=Machine.current(),
    +        name="papers-dl",
    +        status=Binary.StatusChoices.QUEUED,
    +        retry_at=runner_module.timezone.now(),
    +    )
    +    # Patch the model classes so claim_processing_lock always reports success.
    +    monkeypatch.setattr(type(queued_crawl), "claim_processing_lock", lambda self, lock_seconds=60: True)
    +    monkeypatch.setattr(type(unrelated_binary), "claim_processing_lock", lambda self, lock_seconds=60: True)
    +    # Record run_crawl and run_binary invocations separately.
    +    run_calls: list[tuple[str, list[str] | None, bool]] = []
    +    binary_calls: list[str] = []
    +    # Raised by fake_run_crawl to abort run_pending_crawls right after its first run_crawl call.
    +    class _StopScheduling(Exception):
    +        pass
    +
    +    def fake_run_crawl(crawl_id, snapshot_ids=None, selected_plugins=None, process_discovered_snapshots_inline=True):
    +        run_calls.append((crawl_id, snapshot_ids, process_discovered_snapshots_inline))
    +        raise _StopScheduling
    +
    +    def fake_run_binary(binary_id):
    +        binary_calls.append(binary_id)
    +
    +    monkeypatch.setattr(runner_module, "run_crawl", fake_run_crawl)
    +    monkeypatch.setattr(runner_module, "run_binary", fake_run_binary)
    +
    +    with pytest.raises(_StopScheduling):
    +        runner_module.run_pending_crawls(daemon=False)
    +    # The crawl was run first, and run_binary had not been called by the time scheduling was aborted.
    +    assert run_calls == [(str(queued_crawl.id), None, False)]
    +    assert binary_calls == []
    diff --git a/archivebox/tests/test_savepagenow.py b/archivebox/tests/test_savepagenow.py
    new file mode 100644
    index 0000000000..5a34f47627
    --- /dev/null
    +++ b/archivebox/tests/test_savepagenow.py
    @@ -0,0 +1,330 @@
    +"""Integration tests for /web/https://... shortcut (Save Page Now)."""
    +
    +import os
    +import subprocess
    +import sys
    +import textwrap
    +from pathlib import Path
    +
    +from archivebox.tests.conftest import create_test_url
    +
    +
    +ADMIN_HOST = "admin.archivebox.localhost:8000"
    +
    +
    +def _run_savepagenow_script(
    +    initialized_archive: Path,  # archive dir: exported as DATA_DIR and used as the subprocess cwd
    +    request_url: str,  # appended to '/web/' to form the requested path
    +    expected_url: str,  # Snapshot.url value the script looks up after the request
    +    *,
    +    login: bool,  # when true, the script creates user 'tester' and force_logins before requesting
    +    public_add_view: bool,  # exported to the subprocess as PUBLIC_ADD_VIEW ("True"/"False")
    +    host: str,  # sent as HTTP_HOST on both requests
    +):  # returns the result of subprocess.run; callers inspect returncode/stdout/stderr
    +    script = textwrap.dedent(  # outer f-string fills {x!r} now; {{...}} stays a literal {...} for the script's own f-strings
    +        f"""
    +        import os
    +
    +        os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings')
    +
    +        from archivebox.config.django import setup_django
    +        setup_django()
    +
    +        from django.test import Client
    +        from django.contrib.auth import get_user_model
    +        from archivebox.core.models import Snapshot
    +
    +        client = Client()
    +        if {login!r}:
    +            user = get_user_model().objects.create_user(username='tester', password='pw')
    +            client.force_login(user)
    +
    +        target_url = {request_url!r}
    +
    +        resp = client.get('/web/' + target_url, HTTP_HOST={host!r})
    +        assert resp.status_code == 302, resp.status_code
    +
    +        snapshot = Snapshot.objects.filter(url={expected_url!r}).order_by('-created_at').first()
    +        if snapshot is None:
    +            raise AssertionError(
    +                "snapshot not created; status=%s location=%s count=%s"
    +                % (
    +                    resp.status_code,
    +                    resp.get('Location'),
    +                    Snapshot.objects.count(),
    +                )
    +            )
    +        assert resp['Location'] == f"/{{snapshot.url_path}}"
    +
    +        resp2 = client.get('/web/' + target_url, HTTP_HOST={host!r})
    +        assert resp2.status_code == 302, resp2.status_code
    +        assert Snapshot.objects.filter(url={expected_url!r}).count() == 1
    +        assert resp2['Location'] == f"/{{snapshot.url_path}}"
    +        """,
    +    )
    +    # Inherit the parent environment, point DATA_DIR at the archive, and set every SAVE_* flag listed here to "False".
    +    env = {
    +        **os.environ,
    +        "DATA_DIR": str(initialized_archive),
    +        "USE_COLOR": "False",
    +        "SHOW_PROGRESS": "False",
    +        "PUBLIC_ADD_VIEW": "True" if public_add_view else "False",
    +        "SAVE_ARCHIVEDOTORG": "False",
    +        "SAVE_TITLE": "False",
    +        "SAVE_FAVICON": "False",
    +        "SAVE_WGET": "False",
    +        "SAVE_WARC": "False",
    +        "SAVE_PDF": "False",
    +        "SAVE_SCREENSHOT": "False",
    +        "SAVE_DOM": "False",
    +        "SAVE_SINGLEFILE": "False",
    +        "SAVE_READABILITY": "False",
    +        "SAVE_MERCURY": "False",
    +        "SAVE_GIT": "False",
    +        "SAVE_YTDLP": "False",
    +        "SAVE_HEADERS": "False",
    +        "SAVE_HTMLTOTEXT": "False",
    +    }
    +    # Run the script in a fresh interpreter (60s timeout); output is captured as text and a non-zero exit does not raise.
    +    return subprocess.run(
    +        [sys.executable, "-c", script],
    +        cwd=initialized_archive,
    +        env=env,
    +        text=True,
    +        capture_output=True,
    +        timeout=60,
    +    )
    +
    +
    +def _run_savepagenow_not_found_script(initialized_archive: Path, request_url: str):
    +    script = textwrap.dedent(  # script: anonymous GET on the web host must 302 to the same path on ADMIN_HOST and create no Snapshot
    +        f"""
    +        import os
    +
    +        os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings')
    +
    +        from archivebox.config.django import setup_django
    +        setup_django()
    +
    +        from django.test import Client
    +        from archivebox.core.models import Snapshot
    +
    +        client = Client()
    +        target_url = {request_url!r}
    +
    +        resp = client.get('/web/' + target_url, HTTP_HOST='web.archivebox.localhost:8000')
    +        assert resp.status_code == 302, resp.status_code
    +        assert resp['Location'] == f'http://{ADMIN_HOST}/web/' + target_url
    +        assert Snapshot.objects.count() == 0
    +        """,
    +    )
    +    # Inherit the parent environment, point DATA_DIR at the archive, force PUBLIC_ADD_VIEW and the listed SAVE_* flags to "False".
    +    env = {
    +        **os.environ,
    +        "DATA_DIR": str(initialized_archive),
    +        "USE_COLOR": "False",
    +        "SHOW_PROGRESS": "False",
    +        "PUBLIC_ADD_VIEW": "False",
    +        "SAVE_ARCHIVEDOTORG": "False",
    +        "SAVE_TITLE": "False",
    +        "SAVE_FAVICON": "False",
    +        "SAVE_WGET": "False",
    +        "SAVE_WARC": "False",
    +        "SAVE_PDF": "False",
    +        "SAVE_SCREENSHOT": "False",
    +        "SAVE_DOM": "False",
    +        "SAVE_SINGLEFILE": "False",
    +        "SAVE_READABILITY": "False",
    +        "SAVE_MERCURY": "False",
    +        "SAVE_GIT": "False",
    +        "SAVE_YTDLP": "False",
    +        "SAVE_HEADERS": "False",
    +        "SAVE_HTMLTOTEXT": "False",
    +    }
    +    # Run the script in a fresh interpreter (60s timeout); output is captured as text and a non-zero exit does not raise.
    +    return subprocess.run(
    +        [sys.executable, "-c", script],
    +        cwd=initialized_archive,
    +        env=env,
    +        text=True,
    +        capture_output=True,
    +        timeout=60,
    +    )
    +
    +
    +def _run_savepagenow_via_web_host_redirect_script(initialized_archive: Path, request_url: str, expected_url: str):
    +    script = textwrap.dedent(  # script: logged-in GET on the web host 302s to ADMIN_HOST; repeating it there yields exactly one Snapshot
    +        f"""
    +        import os
    +
    +        os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings')
    +
    +        from archivebox.config.django import setup_django
    +        setup_django()
    +
    +        from django.test import Client
    +        from django.contrib.auth import get_user_model
    +        from archivebox.core.models import Snapshot
    +
    +        client = Client()
    +        user = get_user_model().objects.create_user(username='tester', password='pw')
    +        client.force_login(user)
    +
    +        target_url = {request_url!r}
    +
    +        resp = client.get('/web/' + target_url, HTTP_HOST='web.archivebox.localhost:8000')
    +        assert resp.status_code == 302, resp.status_code
    +        assert resp['Location'] == f'http://{ADMIN_HOST}/web/' + target_url
    +
    +        resp2 = client.get('/web/' + target_url, HTTP_HOST={ADMIN_HOST!r})
    +        assert resp2.status_code == 302, resp2.status_code
    +
    +        snapshot = Snapshot.objects.filter(url={expected_url!r}).order_by('-created_at').first()
    +        assert snapshot is not None
    +        assert resp2['Location'] == f"/{{snapshot.url_path}}"
    +        assert Snapshot.objects.filter(url={expected_url!r}).count() == 1
    +        """,
    +    )
    +    # Inherit the parent environment, point DATA_DIR at the archive, force PUBLIC_ADD_VIEW and the listed SAVE_* flags to "False".
    +    env = {
    +        **os.environ,
    +        "DATA_DIR": str(initialized_archive),
    +        "USE_COLOR": "False",
    +        "SHOW_PROGRESS": "False",
    +        "PUBLIC_ADD_VIEW": "False",
    +        "SAVE_ARCHIVEDOTORG": "False",
    +        "SAVE_TITLE": "False",
    +        "SAVE_FAVICON": "False",
    +        "SAVE_WGET": "False",
    +        "SAVE_WARC": "False",
    +        "SAVE_PDF": "False",
    +        "SAVE_SCREENSHOT": "False",
    +        "SAVE_DOM": "False",
    +        "SAVE_SINGLEFILE": "False",
    +        "SAVE_READABILITY": "False",
    +        "SAVE_MERCURY": "False",
    +        "SAVE_GIT": "False",
    +        "SAVE_YTDLP": "False",
    +        "SAVE_HEADERS": "False",
    +        "SAVE_HTMLTOTEXT": "False",
    +    }
    +    # Run the script in a fresh interpreter (60s timeout); output is captured as text and a non-zero exit does not raise.
    +    return subprocess.run(
    +        [sys.executable, "-c", script],
    +        cwd=initialized_archive,
    +        env=env,
    +        text=True,
    +        capture_output=True,
    +        timeout=60,
    +    )
    +
    +
    +def _run_savepagenow_existing_snapshot_script(initialized_archive: Path, request_url: str, stored_url: str):
    +    script = textwrap.dedent(  # script: pre-create a Snapshot for stored_url, then an anonymous GET must 302 to its url_path
    +        f"""
    +        import os
    +
    +        os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings')
    +
    +        from archivebox.config.django import setup_django
    +        setup_django()
    +
    +        from django.test import Client
    +        from archivebox.core.models import Snapshot
    +        from archivebox.crawls.models import Crawl
    +        from archivebox.base_models.models import get_or_create_system_user_pk
    +
    +        target_url = {request_url!r}
    +        stored_url = {stored_url!r}
    +        created_by_id = get_or_create_system_user_pk()
    +        crawl = Crawl.objects.create(urls=stored_url, created_by_id=created_by_id)
    +        snapshot = Snapshot.objects.create(url=stored_url, crawl=crawl)
    +
    +        client = Client()
    +        resp = client.get('/web/' + target_url, HTTP_HOST='web.archivebox.localhost:8000')
    +        assert resp.status_code == 302, resp.status_code
    +        assert resp['Location'] == f"/{{snapshot.url_path}}"
    +        """,
    +    )
    +    # Inherit the parent environment, point DATA_DIR at the archive, force PUBLIC_ADD_VIEW and the listed SAVE_* flags to "False".
    +    env = {
    +        **os.environ,
    +        "DATA_DIR": str(initialized_archive),
    +        "USE_COLOR": "False",
    +        "SHOW_PROGRESS": "False",
    +        "PUBLIC_ADD_VIEW": "False",
    +        "SAVE_ARCHIVEDOTORG": "False",
    +        "SAVE_TITLE": "False",
    +        "SAVE_FAVICON": "False",
    +        "SAVE_WGET": "False",
    +        "SAVE_WARC": "False",
    +        "SAVE_PDF": "False",
    +        "SAVE_SCREENSHOT": "False",
    +        "SAVE_DOM": "False",
    +        "SAVE_SINGLEFILE": "False",
    +        "SAVE_READABILITY": "False",
    +        "SAVE_MERCURY": "False",
    +        "SAVE_GIT": "False",
    +        "SAVE_YTDLP": "False",
    +        "SAVE_HEADERS": "False",
    +        "SAVE_HTMLTOTEXT": "False",
    +    }
    +    # Run the script in a fresh interpreter (60s timeout); output is captured as text and a non-zero exit does not raise.
    +    return subprocess.run(
    +        [sys.executable, "-c", script],
    +        cwd=initialized_archive,
    +        env=env,
    +        text=True,
    +        capture_output=True,
    +        timeout=60,
    +    )
    +
    +
    +def test_web_add_creates_and_reuses_snapshot_logged_in(initialized_archive):
    +    """/web/https://... should work for authenticated users even when public add is off."""
    +    url = create_test_url(domain="example.com", path="savepagenow-auth")
    +    request_url = url.replace("https://", "")  # request without the "https://" prefix; the snapshot is still looked up by the full url
    +    result = _run_savepagenow_script(initialized_archive, request_url, url, login=True, public_add_view=False, host=ADMIN_HOST)
    +    assert result.returncode == 0, f"SavePageNow shortcut (logged-in) test failed.\nstdout:\n{result.stdout}\nstderr:\n{result.stderr}"
    +
    +
    +def test_web_add_creates_and_reuses_snapshot_public(initialized_archive):
    +    """/web/https://... should work when PUBLIC_ADD_VIEW is enabled without login."""
    +    url = create_test_url(domain="example.com", path="savepagenow-public")
    +    request_url = url  # here the URL is placed after /web/ unmodified
    +    result = _run_savepagenow_script(
    +        initialized_archive,
    +        request_url,
    +        url,
    +        login=False,  # anonymous client
    +        public_add_view=True,
    +        host="web.archivebox.localhost:8000",  # the web host rather than ADMIN_HOST
    +    )
    +    assert result.returncode == 0, f"SavePageNow shortcut (public add) test failed.\nstdout:\n{result.stdout}\nstderr:\n{result.stderr}"
    +
    +
    +def test_web_add_requires_login_when_public_off(initialized_archive):
    +    """With PUBLIC_ADD_VIEW off, an anonymous /web/https://... request should be bounced to admin."""
    +    target_url = create_test_url(domain="example.com", path="savepagenow-404")
    +    # The full URL, scheme included, is used as the request path.
    +    proc = _run_savepagenow_not_found_script(initialized_archive, target_url)
    +    assert proc.returncode == 0, f"SavePageNow shortcut (no public add) test failed.\nstdout:\n{proc.stdout}\nstderr:\n{proc.stderr}"
    +
    +
    +def test_web_add_redirects_to_admin_and_creates_when_logged_in(initialized_archive):
    +    """A /web/https://... hit on the web host should redirect to the admin host and create there when logged in."""
    +    target_url = create_test_url(domain="example.com", path="savepagenow-web-admin")
    +    proc = _run_savepagenow_via_web_host_redirect_script(initialized_archive, target_url, target_url)
    +    failure_report = f"SavePageNow shortcut (web->admin redirect) test failed.\nstdout:\n{proc.stdout}\nstderr:\n{proc.stderr}"
    +    # The report embeds both captured streams so a failing subprocess can be diagnosed from pytest output.
    +    assert proc.returncode == 0, failure_report
    +
    +
    +def test_web_add_redirects_existing_snapshot_when_public_off(initialized_archive):
    +    """/web/<url> should redirect to an existing snapshot even for anonymous users with public add off."""
    +    target_url = create_test_url(domain="example.com", path="savepagenow-existing")
    +    # Request the scheme-less form of the URL.
    +    schemeless_url = target_url.replace("https://", "")
    +    proc = _run_savepagenow_existing_snapshot_script(initialized_archive, schemeless_url, target_url)
    +    failure_report = f"SavePageNow shortcut (existing snapshot) test failed.\nstdout:\n{proc.stdout}\nstderr:\n{proc.stderr}"
    +    assert proc.returncode == 0, failure_report
    diff --git a/archivebox/tests/test_schedule.py b/archivebox/tests/test_schedule.py
    new file mode 100644
    index 0000000000..c891f2de4a
    --- /dev/null
    +++ b/archivebox/tests/test_schedule.py
    @@ -0,0 +1,128 @@
    +#!/usr/bin/env python3
    +"""Integration tests for the database-backed archivebox schedule command."""
    +
    +import os
    +import sqlite3
    +import subprocess
    +
    +import pytest
    +
    +
    +def _fetchone(tmp_path, query):  # -> first row of `query`, or None when it yields no rows
    +    conn = sqlite3.connect(tmp_path / "index.sqlite3")  # the index.sqlite3 file directly under tmp_path
    +    try:
    +        return conn.execute(query).fetchone()
    +    finally:
    +        conn.close()  # close the connection even if the query raises
    +
    +
    +def test_schedule_creates_enabled_db_schedule(tmp_path, process):
    +    """`archivebox schedule --every=daily --depth=1 URL` stores an enabled schedule row and a sealed crawl row."""
    +    os.chdir(tmp_path)
    +    result = subprocess.run(
    +        ["archivebox", "schedule", "--every=daily", "--depth=1", "https://example.com/feed.xml"],
    +        capture_output=True,
    +        text=True,
    +    )
    +
    +    # Include the captured streams: a bare returncode assert gives no hint why the CLI failed.
    +    assert result.returncode == 0, f"stdout:\n{result.stdout}\nstderr:\n{result.stderr}"
    +    schedule_row = _fetchone(
    +        tmp_path,
    +        "SELECT schedule, is_enabled, label FROM crawls_crawlschedule ORDER BY created_at DESC LIMIT 1",
    +    )
    +    crawl_row = _fetchone(
    +        tmp_path,
    +        "SELECT urls, status, max_depth FROM crawls_crawl ORDER BY created_at DESC LIMIT 1",
    +    )
    +
    +    assert schedule_row == ("daily", 1, "Scheduled import: https://example.com/feed.xml")
    +    assert crawl_row == ("https://example.com/feed.xml", "sealed", 1)
    +
    +
    +def test_schedule_show_lists_enabled_schedules(tmp_path, process):
    +    """`archivebox schedule --show` lists a previously created weekly schedule and its URL."""
    +    os.chdir(tmp_path)
    +    subprocess.run(
    +        ["archivebox", "schedule", "--every=weekly", "https://example.com/feed.xml"],
    +        capture_output=True,
    +        text=True,
    +        check=True,
    +    )
    +
    +    result = subprocess.run(
    +        ["archivebox", "schedule", "--show"],
    +        capture_output=True,
    +        text=True,
    +    )
    +
    +    assert result.returncode == 0, f"stdout:\n{result.stdout}\nstderr:\n{result.stderr}"
    +    assert "Active scheduled crawls" in result.stdout
    +    assert "https://example.com/feed.xml" in result.stdout
    +    assert "weekly" in result.stdout
    +
    +
    +def test_schedule_clear_disables_existing_schedules(tmp_path, process):
    +    """`archivebox schedule --clear` disables the existing schedule row rather than leaving it enabled."""
    +    os.chdir(tmp_path)
    +    subprocess.run(
    +        ["archivebox", "schedule", "--every=daily", "https://example.com/feed.xml"],
    +        capture_output=True,
    +        text=True,
    +        check=True,
    +    )
    +
    +    result = subprocess.run(
    +        ["archivebox", "schedule", "--clear"],
    +        capture_output=True,
    +        text=True,
    +    )
    +
    +    assert result.returncode == 0, f"stdout:\n{result.stdout}\nstderr:\n{result.stderr}"
    +    assert "Disabled 1 scheduled crawl" in result.stdout
    +
    +    disabled_count = _fetchone(
    +        tmp_path,
    +        "SELECT COUNT(*) FROM crawls_crawlschedule WHERE is_enabled = 0",
    +    )[0]
    +    enabled_count = _fetchone(
    +        tmp_path,
    +        "SELECT COUNT(*) FROM crawls_crawlschedule WHERE is_enabled = 1",
    +    )[0]
    +
    +    assert disabled_count == 1
    +    assert enabled_count == 0
    +
    +
    +def test_schedule_every_requires_valid_period(tmp_path, process):
    +    """An unrecognised --every value exits non-zero and reports "Invalid schedule" on stderr or stdout."""
    +    os.chdir(tmp_path)
    +    outcome = subprocess.run(
    +        ["archivebox", "schedule", "--every=invalid_period", "https://example.com/feed.xml"],
    +        capture_output=True,
    +        text=True,
    +    )
    +
    +    assert outcome.returncode != 0
    +    assert any("Invalid schedule" in stream for stream in (outcome.stderr, outcome.stdout))
    +
    +
    +class TestScheduleCLI:
    +    """Smoke tests for the ``archivebox schedule`` command-line interface."""
    +
    +    def test_cli_help(self, tmp_path, process):
    +        """``--help`` exits with status 0 and mentions each scheduling flag."""
    +        os.chdir(tmp_path)
    +        result = subprocess.run(
    +            ["archivebox", "schedule", "--help"],
    +            capture_output=True,
    +            text=True,
    +        )
    +
    +        assert result.returncode == 0
    +        for flag in ("--every", "--show", "--clear", "--run-all"):
    +            assert flag in result.stdout
    +
    +
    +if __name__ == "__main__":  # executing this file directly hands it to pytest
    +    pytest.main([__file__, "-v"])  # "-v" = verbose, one result line per test
    diff --git a/archivebox/tests/test_schedule_e2e.py b/archivebox/tests/test_schedule_e2e.py
    new file mode 100644
    index 0000000000..7b4b6c1b0b
    --- /dev/null
    +++ b/archivebox/tests/test_schedule_e2e.py
    @@ -0,0 +1,423 @@
    +#!/usr/bin/env python3
    +"""End-to-end tests for scheduling across CLI, server, API, and web UI."""
    +
    +import os
    +import socket
    +import sqlite3
    +import subprocess
    +import sys
    +import textwrap
    +import time
    +from pathlib import Path
    +
    +import pytest
    +import requests
    +
    +from .conftest import run_python_cwd
    +
    +
    +REPO_ROOT = Path(__file__).resolve().parents[2]  # archivebox/tests/<this file> -> two directories up is the repository root
    +
    +
    +def init_archive(cwd: Path) -> None:
    +    """Run ``archivebox init --quick`` with *cwd* as the working directory.
    +
    +    Raises AssertionError carrying the command's stderr if it exits non-zero.
    +    """
    +    init_cmd = [sys.executable, "-m", "archivebox", "init", "--quick"]
    +    proc = subprocess.run(init_cmd, cwd=cwd, capture_output=True, text=True, timeout=60)
    +
    +    assert proc.returncode == 0, proc.stderr
    +
    +
    +def build_test_env(port: int, **extra: str) -> dict[str, str]:
    +    """Build the environment passed to archivebox subprocesses in these tests.
    +
    +    Copies the current environment without DATA_DIR, points the listen/CSRF hosts
    +    at *port*, turns every SAVE_* flag listed below off except SAVE_WGET, and
    +    applies *extra* last so keyword overrides beat every default set here.
    +    """
    +    env = {name: value for name, value in os.environ.items() if name != "DATA_DIR"}
    +    # Server and runtime settings.
    +    env.update(
    +        PLUGINS="wget",
    +        LISTEN_HOST=f"archivebox.localhost:{port}",
    +        ALLOWED_HOSTS="*",
    +        CSRF_TRUSTED_ORIGINS=f"http://admin.archivebox.localhost:{port}",
    +        PUBLIC_ADD_VIEW="True",
    +        USE_COLOR="False",
    +        SHOW_PROGRESS="False",
    +        TIMEOUT="30",
    +        URL_ALLOWLIST=r"127\.0\.0\.1[:/].*",
    +        SAVE_WGET="True",
    +        USE_CHROME="False",
    +    )
    +
    +    disabled_extractors = (
    +        "ARCHIVEDOTORG", "TITLE", "FAVICON", "WARC",
    +        "PDF", "SCREENSHOT", "DOM", "SINGLEFILE",
    +        "READABILITY", "MERCURY", "GIT", "YTDLP",
    +        "HEADERS", "HTMLTOTEXT",
    +    )
    +    env.update({f"SAVE_{name}": "False" for name in disabled_extractors})
    +
    +    # Caller-supplied overrides are applied after all defaults.
    +    env.update(extra)
    +    return env
    +
    +
    +def get_free_port() -> int:  # returns a loopback TCP port number chosen by the OS
    +    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
    +        sock.bind(("127.0.0.1", 0))  # port 0 asks the OS to pick any unused port
    +        return sock.getsockname()[1]  # the socket closes on exit, so the port is free again for the caller
    +
    +
    +def start_server(cwd: Path, env: dict[str, str], port: int) -> None:
    +    """Run ``archivebox server --daemonize 127.0.0.1:<port>`` from *cwd* with *env*.
    +
    +    Raises AssertionError carrying the command's stderr if it exits non-zero.
    +    """
    +    server_cmd = [sys.executable, "-m", "archivebox", "server", "--daemonize", f"127.0.0.1:{port}"]
    +    launch = subprocess.run(
    +        server_cmd, cwd=cwd, capture_output=True, text=True, env=env, timeout=60,
    +    )
    +    assert launch.returncode == 0, launch.stderr
    +
    +
    +def stop_server(cwd: Path) -> None:
    +    """Call ``stop_existing_supervisord_process()`` in a Python subprocess run from *cwd*."""
    +    shutdown_script = textwrap.dedent("""
    +        import os
    +        os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
    +        import django
    +        django.setup()
    +        from archivebox.workers.supervisord_util import stop_existing_supervisord_process
    +        stop_existing_supervisord_process()
    +        print('stopped')
    +    """)
    +    # The helper's result is discarded, so a non-zero exit of the script does not fail the caller.
    +    run_python_cwd(shutdown_script, cwd=cwd, timeout=30)
    +
    +
    +def wait_for_http(port: int, host: str, path: str = "/", timeout: int = 30) -> requests.Response:
    +    """Poll http://127.0.0.1:<port><path> with the given Host header until it answers below HTTP 500.
    +
    +    Returns the first non-5xx response; raises AssertionError naming the last failure after *timeout* seconds.
    +    """
    +    deadline = time.time() + timeout
    +    last_error: object = "no response received"
    +    while time.time() < deadline:
    +        try:
    +            response = requests.get(f"http://127.0.0.1:{port}{path}", headers={"Host": host}, timeout=2, allow_redirects=False)
    +            if response.status_code < 500:
    +                return response
    +            last_error = f"HTTP {response.status_code}"  # remember 5xx replies so the timeout message is not just "None"
    +        except requests.RequestException as exc:
    +            last_error = exc
    +        time.sleep(0.5)
    +    raise AssertionError(f"Timed out waiting for HTTP on {host}: {last_error}")
    +
    +
    +def make_latest_schedule_due(cwd: Path) -> None:
    +    """Backdate the newest schedule's template crawl (created_at/modified_at) by two days."""
    +    db = sqlite3.connect(cwd / "index.sqlite3")
    +    try:
    +        db.execute(
    +            """
    +            UPDATE crawls_crawl
    +            SET created_at = datetime('now', '-2 day'),
    +                modified_at = datetime('now', '-2 day')
    +            WHERE id = (
    +                SELECT template_id
    +                FROM crawls_crawlschedule
    +                ORDER BY created_at DESC
    +                LIMIT 1
    +            )
    +            """)
    +        db.commit()
    +    finally:
    +        db.close()
    +
    +
    +def get_snapshot_file_text(cwd: Path, url: str) -> str:
    +    """Return the printed text of the first captured html/txt file of the newest snapshot of *url*, which must be sealed."""
    +    probe_script = textwrap.dedent(f"""
    +        import os
    +        from pathlib import Path
    +
    +        os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
    +        import django
    +        django.setup()
    +
    +        from archivebox.core.models import Snapshot
    +
    +        snapshot = Snapshot.objects.filter(url={url!r}).order_by('-created_at').first()
    +        assert snapshot is not None, 'missing snapshot'
    +        assert snapshot.status == 'sealed', snapshot.status
    +
    +        snapshot_dir = Path(snapshot.output_dir)
    +        candidates = []
    +        preferred_patterns = (
    +            'wget/**/index.html',
    +            'wget/**/*.html',
    +            'trafilatura/content.html',
    +            'trafilatura/content.txt',
    +            'defuddle/content.html',
    +            'defuddle/content.txt',
    +        )
    +        for pattern in preferred_patterns:
    +            for candidate in snapshot_dir.glob(pattern):
    +                if candidate.is_file():
    +                    candidates.append(candidate)
    +
    +        if not candidates:
    +            for candidate in snapshot_dir.rglob('*'):
    +                if not candidate.is_file():
    +                    continue
    +                rel = candidate.relative_to(snapshot_dir)
    +                if rel.parts and rel.parts[0] == 'responses':
    +                    continue
    +                if candidate.suffix not in ('.html', '.htm', '.txt'):
    +                    continue
    +                if candidate.name in ('stdout.log', 'stderr.log', 'cmd.sh'):
    +                    continue
    +                candidates.append(candidate)
    +
    +        assert candidates, f'no captured html/txt files found in {{snapshot_dir}}'
    +        print(candidates[0].read_text(errors='ignore'))
    +    """)
    +    # A failed assert inside the script (no snapshot, not sealed, no files) shows up here as a non-zero exit.
    +    out, err, exit_code = run_python_cwd(probe_script, cwd=cwd, timeout=60)
    +    assert exit_code == 0, err
    +    return out
    +
    +
    +def wait_for_snapshot_capture(cwd: Path, url: str, timeout: int = 180) -> str:
    +    """Retry get_snapshot_file_text every 2 seconds until it succeeds; raise AssertionError after *timeout* seconds."""
    +    give_up_at, failure = time.time() + timeout, None
    +    while time.time() < give_up_at:
    +        try:
    +            return get_snapshot_file_text(cwd, url)
    +        except AssertionError as exc:
    +            failure = exc
    +        time.sleep(2)
    +    raise AssertionError(f"timed out waiting for captured content for {url}: {failure}")
    +
    +
    +def get_counts(cwd: Path, scheduled_url: str, one_shot_url: str) -> tuple[int, int, int]:
    +    """Return (snapshots of scheduled_url, snapshots of one_shot_url, schedule-linked crawls of scheduled_url)."""
    +    snapshot_sql = "SELECT COUNT(*) FROM core_snapshot WHERE url = ?"
    +    db = sqlite3.connect(cwd / "index.sqlite3")
    +
    +    def count(sql: str, url: str) -> int:
    +        return db.execute(sql, (url,)).fetchone()[0]
    +
    +    try:
    +        scheduled_snapshots = count(snapshot_sql, scheduled_url)
    +        one_shot_snapshots = count(snapshot_sql, one_shot_url)
    +        scheduled_crawls = count(
    +            """
    +            SELECT COUNT(*)
    +            FROM crawls_crawl
    +            WHERE schedule_id IS NOT NULL
    +              AND urls = ?
    +            """,
    +            scheduled_url,
    +        )
    +        return scheduled_snapshots, one_shot_snapshots, scheduled_crawls
    +    finally:
    +        db.close()
    +
    +
    +def create_admin_and_token(cwd: Path) -> str:
    +    """Ensure the 'apitestadmin' superuser exists and return a new API token for it that expires in one day."""
    +    provision_script = textwrap.dedent("""
    +        import os
    +        from datetime import timedelta
    +        from django.utils import timezone
    +
    +        os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
    +        import django
    +        django.setup()
    +
    +        from django.contrib.auth import get_user_model
    +        from archivebox.api.models import APIToken
    +
    +        User = get_user_model()
    +        user, _ = User.objects.get_or_create(
    +            username='apitestadmin',
    +            defaults={
    +                'email': 'apitestadmin@example.com',
    +                'is_staff': True,
    +                'is_superuser': True,
    +            },
    +        )
    +        user.is_staff = True
    +        user.is_superuser = True
    +        user.set_password('testpass123')
    +        user.save()
    +
    +        token = APIToken.objects.create(
    +            created_by=user,
    +            expires=timezone.now() + timedelta(days=1),
    +        )
    +        print(token.token)
    +    """)
    +    out, err, exit_code = run_python_cwd(provision_script, cwd=cwd, timeout=60)
    +    assert exit_code == 0, err
    +    # Only the last printed line is taken as the token; any earlier output lines are ignored.
    +    return out.strip().splitlines()[-1]
    +
    +
    +@pytest.mark.timeout(300)  # must exceed the 180s capture wait below plus setup, so that wait's own error can fire first
    +def test_server_processes_due_cli_schedule_and_saves_real_content(tmp_path, recursive_test_site):
    +    """A schedule created via the CLI and backdated is picked up by a running server, which captures the page."""
    +    os.chdir(tmp_path)
    +    init_archive(tmp_path)
    +    port = get_free_port()
    +    env = build_test_env(port)
    +
    +    schedule_result = subprocess.run(
    +        [sys.executable, "-m", "archivebox", "schedule", "--every=daily", "--depth=0", recursive_test_site["root_url"]],
    +        cwd=tmp_path,
    +        capture_output=True,
    +        text=True,
    +        env=env,
    +        timeout=60,
    +    )
    +    assert schedule_result.returncode == 0, schedule_result.stderr
    +    assert "Created scheduled crawl" in schedule_result.stdout
    +
    +    make_latest_schedule_due(tmp_path)
    +
    +    try:
    +        start_server(tmp_path, env=env, port=port)
    +        wait_for_http(port, host=f"web.archivebox.localhost:{port}")
    +        captured_text = wait_for_snapshot_capture(tmp_path, recursive_test_site["root_url"], timeout=180)
    +        assert "Root" in captured_text
    +        assert "About" in captured_text
    +    finally:
    +        stop_server(tmp_path)
    +
    +
    +@pytest.mark.timeout(180)
    +def test_archivebox_add_remains_one_shot_even_when_schedule_is_due(tmp_path, recursive_test_site):
    +    """`archivebox add` archives only the requested URL and does not materialize a run of a due schedule."""
    +    os.chdir(tmp_path)
    +    init_archive(tmp_path)
    +    port = get_free_port()
    +    env = build_test_env(port)
    +    scheduled_url = recursive_test_site["root_url"]
    +    one_shot_url = recursive_test_site["child_urls"][0]
    +
    +    schedule_proc = subprocess.run(
    +        [sys.executable, "-m", "archivebox", "schedule", "--every=daily", "--depth=0", scheduled_url],
    +        cwd=tmp_path,
    +        capture_output=True,
    +        text=True,
    +        env=env,
    +        timeout=60,
    +    )
    +    assert schedule_proc.returncode == 0, schedule_proc.stderr
    +
    +    make_latest_schedule_due(tmp_path)
    +
    +    add_proc = subprocess.run(
    +        [sys.executable, "-m", "archivebox", "add", "--depth=0", "--plugins=wget", one_shot_url],
    +        cwd=tmp_path,
    +        capture_output=True,
    +        text=True,
    +        env=env,
    +        timeout=120,
    +    )
    +    assert add_proc.returncode == 0, add_proc.stderr
    +    page_text = wait_for_snapshot_capture(tmp_path, one_shot_url, timeout=120)
    +    assert "About" in page_text  # also satisfied by a "Deep About" page, since that text contains "About"
    +
    +    scheduled_snapshots, one_shot_snapshots, scheduled_crawls = get_counts(tmp_path, scheduled_url, one_shot_url)
    +    assert one_shot_snapshots >= 1
    +    assert scheduled_snapshots == 0
    +    assert scheduled_crawls == 1  # template only, no materialized scheduled run
    +
    +
    +@pytest.mark.timeout(180)
    +def test_schedule_rest_api_works_over_running_server(tmp_path, recursive_test_site):
    +    """POST /api/v1/cli/schedule against a running server succeeds and reports one created schedule as JSON."""
    +    os.chdir(tmp_path)
    +    init_archive(tmp_path)
    +    port = get_free_port()
    +    env = build_test_env(port)
    +    api_token = create_admin_and_token(tmp_path)
    +    api_host = f"api.archivebox.localhost:{port}"
    +
    +    try:
    +        start_server(tmp_path, env=env, port=port)
    +        wait_for_http(port, host=api_host, path="/api/v1/docs")
    +        reply = requests.post(
    +            f"http://127.0.0.1:{port}/api/v1/cli/schedule",
    +            headers={
    +                "Host": api_host,
    +                "X-ArchiveBox-API-Key": api_token,
    +            },
    +            json={
    +                "every": "daily",
    +                "import_path": recursive_test_site["root_url"],
    +                "quiet": True,
    +            },
    +            timeout=10,
    +        )
    +
    +        assert reply.status_code == 200, reply.text
    +        body = reply.json()
    +        assert body["success"] is True
    +        assert body["result_format"] == "json"
    +        assert len(body["result"]["created_schedule_ids"]) == 1
    +    finally:
    +        stop_server(tmp_path)
    +
    +
    +@pytest.mark.timeout(180)
    +def test_schedule_web_ui_post_works_over_running_server(tmp_path, recursive_test_site):
    +    """Posting the /add/ form with schedule=daily redirects and stores a schedule linked to a crawl of that URL."""
    +    os.chdir(tmp_path)
    +    init_archive(tmp_path)
    +    port = get_free_port()
    +    env = build_test_env(port, PUBLIC_ADD_VIEW="True")
    +    web_host = f"web.archivebox.localhost:{port}"
    +    try:
    +        start_server(tmp_path, env=env, port=port)
    +        wait_for_http(port, host=web_host, path="/add/")
    +
    +        reply = requests.post(
    +            f"http://127.0.0.1:{port}/add/",
    +            headers={"Host": web_host},
    +            data={
    +                "url": recursive_test_site["root_url"],
    +                "depth": "0",
    +                "schedule": "daily",
    +                "tag": "web-ui",
    +                "notes": "created from web ui",
    +            },
    +            timeout=10,
    +            allow_redirects=False,
    +        )
    +
    +        assert reply.status_code in (302, 303), reply.text
    +
    +        db = sqlite3.connect(tmp_path / "index.sqlite3")
    +        try:
    +            newest = db.execute(
    +                """
    +                SELECT cs.schedule, c.urls, c.tags_str
    +                FROM crawls_crawlschedule cs
    +                JOIN crawls_crawl c ON c.schedule_id = cs.id
    +                ORDER BY cs.created_at DESC
    +                LIMIT 1
    +                """,
    +            ).fetchone()
    +        finally:
    +            db.close()
    +
    +        assert newest == ("daily", recursive_test_site["root_url"], "web-ui")
    +    finally:
    +        stop_server(tmp_path)
    diff --git a/archivebox/tests/test_server_security_browser.py b/archivebox/tests/test_server_security_browser.py
    new file mode 100644
    index 0000000000..ef59e9b6c9
    --- /dev/null
    +++ b/archivebox/tests/test_server_security_browser.py
    @@ -0,0 +1,603 @@
    +#!/usr/bin/env python3
    +"""Browser-level security mode tests using the existing Node/Puppeteer runtime."""
    +
    +from __future__ import annotations
    +
    +import json
    +import os
    +import shutil
    +import signal
    +import socket
    +import subprocess
    +import sys
    +import textwrap
    +import time
    +from pathlib import Path
    +from urllib.parse import urlencode
    +
    +import pytest
    +import requests
    +
    +from .conftest import _ensure_puppeteer, _find_cached_chromium, _find_system_browser, run_python_cwd
    +
    +
    +PUPPETEER_PROBE_SCRIPT = """\
    +const fs = require("node:fs");
    +const puppeteer = require("puppeteer");
    +
    +async function login(page, config) {
    +  const result = {
    +    reachable: false,
    +    succeeded: false,
    +    finalUrl: null,
    +    status: null,
    +    error: null,
    +  };
    +
    +  try {
    +    const response = await page.goto(config.adminLoginUrl, {
    +      waitUntil: "networkidle2",
    +      timeout: 15000,
    +    });
    +    result.reachable = true;
    +    result.status = response ? response.status() : null;
    +
    +    const usernameInput = await page.$('input[name="username"]');
    +    const passwordInput = await page.$('input[name="password"]');
    +    if (!usernameInput || !passwordInput) {
    +      result.finalUrl = page.url();
    +      return result;
    +    }
    +
    +    await usernameInput.type(config.username);
    +    await passwordInput.type(config.password);
    +    await Promise.all([
    +      page.waitForNavigation({waitUntil: "networkidle2", timeout: 15000}),
    +      page.click('button[type="submit"], input[type="submit"]'),
    +    ]);
    +
    +    result.finalUrl = page.url();
    +    result.succeeded = !page.url().includes("/admin/login/");
    +    return result;
    +  } catch (error) {
    +    result.error = String(error);
    +    result.finalUrl = page.url();
    +    return result;
    +  }
    +}
    +
    +async function main() {
    +  const config = JSON.parse(fs.readFileSync(0, "utf8"));
    +  const browser = await puppeteer.launch({
    +    executablePath: config.chromePath,
    +    headless: true,
    +    args: [
    +      "--no-sandbox",
    +      "--disable-dev-shm-usage",
    +      "--disable-background-networking",
    +    ],
    +  });
    +
    +  const loginPage = await browser.newPage();
    +  const loginResult = await login(loginPage, config);
    +  await loginPage.close();
    +
    +  const page = await browser.newPage();
    +  const consoleMessages = [];
    +  const requestFailures = [];
    +  page.on("console", (message) => {
    +    consoleMessages.push({type: message.type(), text: message.text()});
    +  });
    +  page.on("pageerror", (error) => {
    +    consoleMessages.push({type: "pageerror", text: String(error)});
    +  });
    +  page.on("requestfailed", (request) => {
    +    requestFailures.push({
    +      url: request.url(),
    +      error: request.failure() ? request.failure().errorText : "unknown",
    +    });
    +  });
    +
    +  const response = await page.goto(config.dangerousUrl, {
    +    waitUntil: "networkidle2",
    +    timeout: 15000,
    +  });
    +
    +  await page.waitForFunction(
    +    () => window.__dangerousScriptRan !== true || window.__probeResults !== undefined,
    +    {timeout: 15000},
    +  );
    +
    +  const pageState = await page.evaluate(() => ({
    +    href: location.href,
    +    scriptRan: window.__dangerousScriptRan === true,
    +    probeResults: window.__probeResults || null,
    +    bodyText: document.body ? document.body.innerText.slice(0, 600) : "",
    +  }));
    +
    +  const output = {
    +    mode: config.mode,
    +    login: loginResult,
    +    dangerousPage: {
    +      status: response ? response.status() : null,
    +      finalUrl: page.url(),
    +      contentSecurityPolicy: response ? response.headers()["content-security-policy"] || null : null,
    +      archiveboxSecurityMode: response ? response.headers()["x-archivebox-security-mode"] || null : null,
    +    },
    +    pageState,
    +    consoleMessages,
    +    requestFailures,
    +  };
    +
    +  console.log(JSON.stringify(output));
    +  await browser.close();
    +}
    +
    +main().catch((error) => {
    +  console.error(String(error));
    +  process.exit(1);
    +});
    +"""
    +
    +
    +def _resolve_browser(shared_lib: Path) -> Path | None:
    +    """Return the first usable Chrome/Chromium executable, or None if none is found.
    +
    +    Lookup order: $CHROME_BINARY or $CHROME_BIN, _find_cached_chromium(shared_lib),
    +    _find_system_browser(), common binary names on PATH, then macOS app bundles.
    +    """
    +    # An environment override is used only when the path it names exists.
    +    configured = os.environ.get("CHROME_BINARY") or os.environ.get("CHROME_BIN")
    +    if configured and (from_env := Path(configured).expanduser()).exists():
    +        return from_env
    +
    +    # Callables keep the second probe lazy: it only runs when the first finds nothing.
    +    for locate in (lambda: _find_cached_chromium(shared_lib), _find_system_browser):
    +        found = locate()
    +        if found and found.exists():
    +            return found
    +
    +    # Fall back to common executable names resolved through PATH.
    +    binary_names = ("chromium", "chromium-browser", "google-chrome", "google-chrome-stable", "chrome")
    +    for name in binary_names:
    +        on_path = shutil.which(name)
    +        if on_path:
    +            return Path(on_path)
    +
    +    mac_bundles = (
    +        Path("/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"),
    +        Path("/Applications/Chromium.app/Contents/MacOS/Chromium"),
    +    )
    +
    +    return next((bundle for bundle in mac_bundles if bundle.exists()), None)
    +
    +
    +@pytest.fixture(scope="session")
    +def browser_runtime(tmp_path_factory):
    +    """Session fixture returning the node_modules directory and browser binary used by the browser probes."""
    +    for tool, label in (("node", "Node.js"), ("npm", "npm")):
    +        assert shutil.which(tool) is not None, f"{label} is required for browser security tests"
    +
    +    lib_dir = tmp_path_factory.mktemp("archivebox_browser_lib")
    +    _ensure_puppeteer(lib_dir)
    +    chrome = _resolve_browser(lib_dir)
    +    assert chrome, "No Chrome/Chromium binary available for browser security tests"
    +
    +    return {
    +        "node_modules_dir": lib_dir / "npm" / "node_modules",
    +        "chrome_binary": chrome,
    +    }
    +
    +
    +def _seed_archive(data_dir: Path) -> dict[str, object]:
    +    script = textwrap.dedent(
    +        """
    +        import json
    +        import os
    +        from pathlib import Path
    +        from django.utils import timezone
    +
    +        os.environ.setdefault("DJANGO_SETTINGS_MODULE", "archivebox.core.settings")
    +        import django
    +        django.setup()
    +
    +        from django.contrib.auth import get_user_model
    +        from archivebox.core.models import Snapshot
    +        from archivebox.crawls.models import Crawl
    +
    +        User = get_user_model()
    +        admin, _ = User.objects.get_or_create(
    +            username="testadmin",
    +            defaults={"email": "admin@example.com", "is_staff": True, "is_superuser": True},
    +        )
    +        admin.set_password("testpassword")
    +        admin.save()
    +
    +        snapshots = {}
    +        fixture_specs = (
    +            ("attacker", "https://attacker.example/entry", "Attacker Snapshot", "ATTACKER_SECRET"),
    +            ("victim", "https://victim.example/private", "Victim Snapshot", "VICTIM_SECRET"),
    +        )
    +
    +        for slug, url, title, secret in fixture_specs:
    +            crawl = Crawl.objects.create(
    +                urls=url,
    +                created_by=admin,
    +                status=Crawl.StatusChoices.SEALED,
    +                retry_at=timezone.now(),
    +            )
    +            snapshot = Snapshot.objects.create(
    +                url=url,
    +                title=title,
    +                crawl=crawl,
    +                status=Snapshot.StatusChoices.SEALED,
    +                downloaded_at=timezone.now(),
    +            )
    +            output_dir = Path(snapshot.output_dir)
    +            output_dir.mkdir(parents=True, exist_ok=True)
    +            (output_dir / "safe.json").write_text(
    +                json.dumps({"slug": slug, "secret": secret}),
    +                encoding="utf-8",
    +            )
    +            if slug == "attacker":
    +                (output_dir / "dangerous.html").write_text(
    +                    '''
    +                    
    +                    
    +                      
    +                        

    Dangerous Replay Fixture

    + + + + ''', + encoding="utf-8", + ) + snapshots[slug] = { + "id": str(snapshot.id), + "domain": snapshot.domain, + } + + print(json.dumps({ + "username": "testadmin", + "password": "testpassword", + "snapshots": snapshots, + })) + """, + ) + stdout, stderr, returncode = run_python_cwd(script, cwd=data_dir, timeout=120) + assert returncode == 0, stderr + return json.loads(stdout.strip()) + + +def _get_free_port() -> int: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: + sock.bind(("127.0.0.1", 0)) + return sock.getsockname()[1] + + +def _wait_for_http( + port: int, + host: str, + timeout: float = 30.0, + process: subprocess.Popen[str] | None = None, +) -> None: + deadline = time.time() + timeout + last_error = "server did not answer" + while time.time() < deadline: + if process is not None and process.poll() is not None: + raise AssertionError(f"Server exited before becoming ready with code {process.returncode}") + try: + response = requests.get( + f"http://127.0.0.1:{port}/", + headers={"Host": host}, + timeout=2, + allow_redirects=False, + ) + if response.status_code < 500: + return + last_error = f"HTTP {response.status_code}" + except requests.RequestException as exc: + last_error = str(exc) + time.sleep(0.5) + raise AssertionError(f"Timed out waiting for {host}: {last_error}") + + +def _start_server(data_dir: Path, *, mode: str, port: int) -> subprocess.Popen[str]: + env = os.environ.copy() + env.pop("DATA_DIR", None) + env.update( + { + "PYTHONPATH": str(Path(__file__).resolve().parents[2]), + "LISTEN_HOST": f"archivebox.localhost:{port}", + "ALLOWED_HOSTS": "*", + "CSRF_TRUSTED_ORIGINS": f"http://archivebox.localhost:{port},http://admin.archivebox.localhost:{port}", + "SERVER_SECURITY_MODE": mode, + "USE_COLOR": "False", + "SHOW_PROGRESS": "False", + "SAVE_ARCHIVEDOTORG": "False", + "SAVE_TITLE": "False", + "SAVE_FAVICON": "False", + "SAVE_WGET": "False", + "SAVE_WARC": "False", + "SAVE_PDF": "False", + "SAVE_SCREENSHOT": "False", + 
"SAVE_DOM": "False", + "SAVE_SINGLEFILE": "False", + "SAVE_READABILITY": "False", + "SAVE_MERCURY": "False", + "SAVE_GIT": "False", + "SAVE_YTDLP": "False", + "SAVE_HEADERS": "False", + "SAVE_HTMLTOTEXT": "False", + "USE_CHROME": "False", + }, + ) + process = subprocess.Popen( + [sys.executable, "-m", "archivebox", "server", "--debug", "--nothreading", f"127.0.0.1:{port}"], + cwd=data_dir, + env=env, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + start_new_session=True, + ) + try: + _wait_for_http(port, f"archivebox.localhost:{port}", process=process) + except AssertionError as exc: + server_log = _stop_server(process) + raise AssertionError(f"{exc}\n\nSERVER LOG:\n{server_log}") from exc + return process + + +def _stop_server(process: subprocess.Popen[str]) -> str: + try: + if process.poll() is None: + os.killpg(process.pid, signal.SIGTERM) + try: + stdout, _ = process.communicate(timeout=3) + except subprocess.TimeoutExpired: + os.killpg(process.pid, signal.SIGKILL) + stdout, _ = process.communicate(timeout=5) + else: + stdout, _ = process.communicate(timeout=5) + except ProcessLookupError: + stdout, _ = process.communicate(timeout=5) + return stdout + + +def _build_probe_config(mode: str, port: int, fixture: dict[str, object], runtime: dict[str, Path]) -> dict[str, str]: + snapshots = fixture["snapshots"] + attacker = snapshots["attacker"] + victim = snapshots["victim"] + base_origin = f"http://archivebox.localhost:{port}" + attacker_id = attacker["id"] + victim_id = victim["id"] + + if mode == "safe-subdomains-fullreplay": + attacker_origin = f"http://{attacker_id}.archivebox.localhost:{port}" + victim_url = f"http://{victim_id}.archivebox.localhost:{port}/safe.json" + dangerous_base = f"{attacker_origin}/dangerous.html" + admin_origin = f"http://admin.archivebox.localhost:{port}" + else: + attacker_origin = base_origin + victim_url = f"{base_origin}/snapshot/{victim_id}/safe.json" + dangerous_base = 
f"{base_origin}/snapshot/{attacker_id}/dangerous.html" + admin_origin = base_origin + + query = urlencode( + { + "own": "safe.json", + "victim": victim_url, + "admin": f"{admin_origin}/admin/", + "api": f"{admin_origin}/api/v1/docs", + }, + ) + + return { + "mode": mode, + "chromePath": str(runtime["chrome_binary"]), + "adminLoginUrl": f"{admin_origin}/admin/login/", + "dangerousUrl": f"{dangerous_base}?{query}", + "username": fixture["username"], + "password": fixture["password"], + } + + +def _run_browser_probe( + data_dir: Path, + runtime: dict[str, Path], + mode: str, + fixture: dict[str, object], + tmp_path: Path, +) -> dict[str, object]: + port = _get_free_port() + process = _start_server(data_dir, mode=mode, port=port) + probe_path = tmp_path / "server_security_probe.js" + probe_path.write_text(PUPPETEER_PROBE_SCRIPT, encoding="utf-8") + probe_config = _build_probe_config(mode, port, fixture, runtime) + + env = os.environ.copy() + env["NODE_PATH"] = str(runtime["node_modules_dir"]) + env["NODE_MODULES_DIR"] = str(runtime["node_modules_dir"]) + env["CHROME_BINARY"] = str(runtime["chrome_binary"]) + env["USE_COLOR"] = "False" + + try: + result = subprocess.run( + ["node", str(probe_path)], + cwd=data_dir, + env=env, + input=json.dumps(probe_config), + capture_output=True, + text=True, + timeout=120, + ) + finally: + server_log = _stop_server(process) + + assert result.returncode == 0, f"{result.stderr}\n\nSERVER LOG:\n{server_log}" + return json.loads(result.stdout.strip()) + + +@pytest.mark.parametrize( + ("mode", "expected"), + [ + ( + "safe-subdomains-fullreplay", + { + "login_succeeds": True, + "script_ran": True, + "victim_ok": False, + "admin_ok": False, + "admin_status": None, + "api_ok": False, + "api_status": None, + "csp_contains": None, + }, + ), + ( + "safe-onedomain-nojsreplay", + { + "login_succeeds": True, + "script_ran": False, + "victim_ok": None, + "admin_ok": None, + "admin_status": None, + "api_ok": None, + "api_status": None, + 
"csp_contains": "sandbox", + }, + ), + ( + "unsafe-onedomain-noadmin", + { + "login_succeeds": False, + "login_status": 403, + "script_ran": True, + "victim_ok": True, + "victim_status": 200, + "admin_ok": True, + "admin_status": 403, + "api_ok": True, + "api_status": 403, + "csp_contains": None, + }, + ), + ( + "danger-onedomain-fullreplay", + { + "login_succeeds": True, + "script_ran": True, + "victim_ok": True, + "victim_status": 200, + "admin_ok": True, + "admin_status": 200, + "api_ok": True, + "api_status": 200, + "csp_contains": None, + }, + ), + ], +) +def test_server_security_modes_in_chrome( + initialized_archive: Path, + browser_runtime, + tmp_path: Path, + mode: str, + expected: dict[str, object], +) -> None: + fixture = _seed_archive(initialized_archive) + result = _run_browser_probe(initialized_archive, browser_runtime, mode, fixture, tmp_path) + + login = result["login"] + dangerous_page = result["dangerousPage"] + page_state = result["pageState"] + probe_results = page_state["probeResults"] or {} + console_texts = [entry["text"] for entry in result["consoleMessages"]] + + assert dangerous_page["status"] == 200 + assert dangerous_page["archiveboxSecurityMode"] == mode + assert page_state["scriptRan"] is expected["script_ran"] + assert login["succeeded"] is expected["login_succeeds"] + + login_status = expected.get("login_status") + if login_status is not None: + assert login["status"] == login_status + + csp_contains = expected.get("csp_contains") + if csp_contains: + csp = dangerous_page["contentSecurityPolicy"] or "" + assert csp_contains in csp + else: + assert dangerous_page["contentSecurityPolicy"] is None + + if mode == "safe-subdomains-fullreplay": + assert probe_results["own"]["ok"] is True + assert probe_results["own"]["status"] == 200 + assert "ATTACKER_SECRET" in probe_results["own"]["sample"] + assert probe_results["victim"]["ok"] is expected["victim_ok"] + assert probe_results["admin"]["ok"] is expected["admin_ok"] + assert 
probe_results["api"]["ok"] is expected["api_ok"] + assert any("CORS policy" in text for text in console_texts) + return + + if mode == "safe-onedomain-nojsreplay": + assert probe_results == {} + assert "Dangerous Replay Fixture" in page_state["bodyText"] + assert any("Blocked script execution" in text for text in console_texts) + return + + assert probe_results["own"]["ok"] is True + assert probe_results["own"]["status"] == 200 + assert "ATTACKER_SECRET" in probe_results["own"]["sample"] + assert probe_results["victim"]["ok"] is expected["victim_ok"] + assert probe_results["victim"]["status"] == expected["victim_status"] + assert "VICTIM_SECRET" in probe_results["victim"]["sample"] + assert probe_results["admin"]["ok"] is expected["admin_ok"] + assert probe_results["admin"]["status"] == expected["admin_status"] + assert probe_results["api"]["ok"] is expected["api_ok"] + assert probe_results["api"]["status"] == expected["api_status"] + + if mode == "unsafe-onedomain-noadmin": + assert "control plane disabled" in probe_results["admin"]["sample"].lower() + assert "control plane disabled" in probe_results["api"]["sample"].lower() + elif mode == "danger-onedomain-fullreplay": + assert "ArchiveBox" in probe_results["admin"]["sample"] + assert "swagger" in probe_results["api"]["sample"].lower() diff --git a/archivebox/tests/test_settings_signal_webhooks.py b/archivebox/tests/test_settings_signal_webhooks.py new file mode 100644 index 0000000000..acb6367dc5 --- /dev/null +++ b/archivebox/tests/test_settings_signal_webhooks.py @@ -0,0 +1,8 @@ +from django.test import TestCase + + +class TestSignalWebhooksSettings(TestCase): + def test_task_handler_is_sync_in_tests(self): + from signal_webhooks.settings import webhook_settings + + assert webhook_settings.TASK_HANDLER.__name__ == "sync_task_handler" diff --git a/archivebox/tests/test_snapshot.py b/archivebox/tests/test_snapshot.py new file mode 100644 index 0000000000..ac8609d50a --- /dev/null +++ 
b/archivebox/tests/test_snapshot.py @@ -0,0 +1,238 @@ +#!/usr/bin/env python3 +"""Integration tests for archivebox snapshot command.""" + +import os +import subprocess +import sqlite3 +from archivebox.machine.models import Process +from datetime import datetime +from urllib.parse import urlparse +import uuid + +import pytest + + +def test_snapshot_creates_snapshot_with_correct_url(tmp_path, process, disable_extractors_dict): + """Test that snapshot stores the exact URL in the database.""" + os.chdir(tmp_path) + + subprocess.run( + ["archivebox", "snapshot", "create", "https://example.com"], + capture_output=True, + env={**disable_extractors_dict, "DATA_DIR": str(tmp_path)}, + ) + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + snapshot_row = c.execute( + "SELECT id, created_at, url, crawl_id FROM core_snapshot WHERE url = ?", + ("https://example.com",), + ).fetchone() + assert snapshot_row is not None + crawl_row = c.execute( + "SELECT id, created_at, urls, created_by_id FROM crawls_crawl WHERE id = ?", + (snapshot_row[3],), + ).fetchone() + assert crawl_row is not None + user_row = c.execute( + "SELECT username FROM auth_user WHERE id = ?", + (crawl_row[3],), + ).fetchone() + assert user_row is not None + conn.close() + + snapshot_id_raw, snapshot_created_at, snapshot_url, crawl_id = snapshot_row + snapshot_id = str(uuid.UUID(snapshot_id_raw)) + username = user_row[0] + snapshot_date_str = datetime.fromisoformat(snapshot_created_at).strftime("%Y%m%d") + domain = urlparse(snapshot_url).hostname or "unknown" + + # Verify crawl symlink exists and is relative + target_path = tmp_path / "users" / username / "snapshots" / snapshot_date_str / domain / snapshot_id + symlinks = [p for p in tmp_path.rglob(str(snapshot_id)) if p.is_symlink()] + assert symlinks, "Snapshot symlink should exist under crawl dir" + link_path = symlinks[0] + + assert link_path.is_symlink(), "Snapshot symlink should exist under crawl dir" + link_target = os.readlink(link_path) + 
assert not os.path.isabs(link_target), "Symlink should be relative" + assert link_path.resolve() == target_path.resolve() + + +def test_snapshot_multiple_urls_creates_multiple_records(tmp_path, process, disable_extractors_dict): + """Test that multiple URLs each get their own snapshot record.""" + os.chdir(tmp_path) + + subprocess.run( + [ + "archivebox", + "snapshot", + "create", + "https://example.com", + "https://iana.org", + ], + capture_output=True, + env={**disable_extractors_dict, "DATA_DIR": str(tmp_path)}, + ) + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + urls = c.execute("SELECT url FROM core_snapshot ORDER BY url").fetchall() + conn.close() + + urls = [u[0] for u in urls] + assert "https://example.com" in urls + assert "https://iana.org" in urls + assert len(urls) >= 2 + + +def test_snapshot_tag_creates_tag_and_links_to_snapshot(tmp_path, process, disable_extractors_dict): + """Test that --tag creates tag record and links it to the snapshot.""" + os.chdir(tmp_path) + + subprocess.run( + [ + "archivebox", + "snapshot", + "create", + "--tag=mytesttag", + "https://example.com", + ], + capture_output=True, + env={**disable_extractors_dict, "DATA_DIR": str(tmp_path)}, + ) + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + + # Verify tag was created + tag = c.execute("SELECT id, name FROM core_tag WHERE name = ?", ("mytesttag",)).fetchone() + assert tag is not None, "Tag 'mytesttag' should exist in core_tag" + tag_id = tag[0] + + # Verify snapshot exists + snapshot = c.execute( + "SELECT id FROM core_snapshot WHERE url = ?", + ("https://example.com",), + ).fetchone() + assert snapshot is not None + snapshot_id = snapshot[0] + + # Verify tag is linked to snapshot via join table + link = c.execute( + """ + SELECT * FROM core_snapshot_tags + WHERE snapshot_id = ? AND tag_id = ? 
+ """, + (snapshot_id, tag_id), + ).fetchone() + conn.close() + + assert link is not None, "Tag should be linked to snapshot via core_snapshot_tags" + + +def test_snapshot_jsonl_output_has_correct_structure(tmp_path, process, disable_extractors_dict): + """Test that JSONL output contains required fields with correct types.""" + os.chdir(tmp_path) + + # Pass URL as argument instead of stdin for more reliable behavior + result = subprocess.run( + ["archivebox", "snapshot", "create", "https://example.com"], + capture_output=True, + text=True, + env={**disable_extractors_dict, "DATA_DIR": str(tmp_path)}, + ) + + # Parse JSONL output lines + records = Process.parse_records_from_text(result.stdout) + snapshot_records = [r for r in records if r.get("type") == "Snapshot"] + + assert len(snapshot_records) >= 1, "Should output at least one Snapshot JSONL record" + + record = snapshot_records[0] + assert record.get("type") == "Snapshot" + assert "id" in record, "Snapshot record should have 'id' field" + assert "url" in record, "Snapshot record should have 'url' field" + assert record["url"] == "https://example.com" + + +def test_snapshot_with_tag_stores_tag_name(tmp_path, process, disable_extractors_dict): + """Test that title is stored when provided via tag option.""" + os.chdir(tmp_path) + + # Use command line args instead of stdin + subprocess.run( + ["archivebox", "snapshot", "create", "--tag=customtag", "https://example.com"], + capture_output=True, + text=True, + env={**disable_extractors_dict, "DATA_DIR": str(tmp_path)}, + ) + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + + # Verify tag was created with correct name + tag = c.execute( + "SELECT name FROM core_tag WHERE name = ?", + ("customtag",), + ).fetchone() + conn.close() + + assert tag is not None + assert tag[0] == "customtag" + + +def test_snapshot_with_depth_sets_snapshot_depth(tmp_path, process, disable_extractors_dict): + """Test that --depth sets snapshot depth when creating snapshots.""" 
+ os.chdir(tmp_path) + + subprocess.run( + [ + "archivebox", + "snapshot", + "create", + "--depth=1", + "https://example.com", + ], + capture_output=True, + env={**disable_extractors_dict, "DATA_DIR": str(tmp_path)}, + ) + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + snapshot = c.execute("SELECT depth FROM core_snapshot ORDER BY created_at DESC LIMIT 1").fetchone() + conn.close() + + assert snapshot is not None, "Snapshot should be created when depth is provided" + assert snapshot[0] == 1, "Snapshot depth should match --depth value" + + +def test_snapshot_allows_duplicate_urls_across_crawls(tmp_path, process, disable_extractors_dict): + """Snapshot create auto-creates a crawl per run; same URL can appear multiple times.""" + os.chdir(tmp_path) + + # Add same URL twice + subprocess.run( + ["archivebox", "snapshot", "create", "https://example.com"], + capture_output=True, + env={**disable_extractors_dict, "DATA_DIR": str(tmp_path)}, + ) + subprocess.run( + ["archivebox", "snapshot", "create", "https://example.com"], + capture_output=True, + env={**disable_extractors_dict, "DATA_DIR": str(tmp_path)}, + ) + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + count = c.execute( + "SELECT COUNT(*) FROM core_snapshot WHERE url = ?", + ("https://example.com",), + ).fetchone()[0] + conn.close() + + assert count == 2, "Same URL should create separate snapshots across different crawls" + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/archivebox/tests/test_tag_admin.py b/archivebox/tests/test_tag_admin.py new file mode 100644 index 0000000000..e8ca7bde3f --- /dev/null +++ b/archivebox/tests/test_tag_admin.py @@ -0,0 +1,204 @@ +import json +from datetime import datetime +from typing import cast + +import pytest +from django.contrib.auth import get_user_model +from django.contrib.auth.models import UserManager +from django.urls import reverse +from django.utils import timezone + + +pytestmark = pytest.mark.django_db + + 
+User = get_user_model() +ADMIN_HOST = "admin.archivebox.localhost:8000" + + +@pytest.fixture +def admin_user(db): + return cast(UserManager, User.objects).create_superuser( + username="tagadmin", + email="tagadmin@test.com", + password="testpassword", + ) + + +@pytest.fixture +def api_token(admin_user): + from archivebox.api.auth import get_or_create_api_token + + token = get_or_create_api_token(admin_user) + assert token is not None + return token.token + + +@pytest.fixture +def crawl(admin_user): + from archivebox.crawls.models import Crawl + + return Crawl.objects.create( + urls="https://example.com", + created_by=admin_user, + ) + + +@pytest.fixture +def tagged_data(crawl, admin_user): + from archivebox.core.models import Snapshot, Tag + + tag = Tag.objects.create(name="Alpha Research", created_by=admin_user) + first = Snapshot.objects.create( + url="https://example.com/one", + title="Example One", + crawl=crawl, + ) + second = Snapshot.objects.create( + url="https://example.com/two", + title="Example Two", + crawl=crawl, + ) + first.tags.add(tag) + second.tags.add(tag) + return tag, [first, second] + + +def test_tag_admin_changelist_renders_custom_ui(client, admin_user, tagged_data): + client.login(username="tagadmin", password="testpassword") + + response = client.get(reverse("admin:core_tag_changelist"), HTTP_HOST=ADMIN_HOST) + + assert response.status_code == 200 + assert b'id="tag-live-search"' in response.content + assert b'id="tag-sort-select"' in response.content + assert b'id="tag-created-by-select"' in response.content + assert b'id="tag-year-select"' in response.content + assert b"Alpha Research" in response.content + assert b'class="tag-card"' in response.content + + +def test_tag_admin_add_view_renders_similar_tag_reference(client, admin_user): + client.login(username="tagadmin", password="testpassword") + + response = client.get(reverse("admin:core_tag_add"), HTTP_HOST=ADMIN_HOST) + + assert response.status_code == 200 + assert b"Similar Tags" in 
response.content + assert b'data-tag-name-input="1"' in response.content + + +def test_tag_search_api_returns_card_payload(client, api_token, tagged_data): + tag, snapshots = tagged_data + + response = client.get( + reverse("api-1:search_tags"), + {"q": "Alpha", "api_key": api_token}, + HTTP_HOST=ADMIN_HOST, + ) + + assert response.status_code == 200 + payload = response.json() + assert payload["sort"] == "created_desc" + assert payload["created_by"] == "" + assert payload["year"] == "" + assert payload["has_snapshots"] == "all" + assert payload["tags"][0]["id"] == tag.id + assert payload["tags"][0]["name"] == "Alpha Research" + assert payload["tags"][0]["num_snapshots"] == 2 + assert payload["tags"][0]["snapshots"][0]["title"] in {"Example One", "Example Two"} + assert payload["tags"][0]["export_jsonl_url"].endswith(f"/api/v1/core/tag/{tag.id}/snapshots.jsonl") + assert payload["tags"][0]["filter_url"].endswith(f"/admin/core/snapshot/?tags__id__exact={tag.id}") + assert {snapshot["url"] for snapshot in payload["tags"][0]["snapshots"]} == {snap.url for snap in snapshots} + + +def test_tag_search_api_respects_sort_and_filters(client, api_token, admin_user, crawl, tagged_data): + from archivebox.core.models import Snapshot, Tag + + other_user = cast(UserManager, User.objects).create_user( + username="tagother", + email="tagother@test.com", + password="unused", + ) + tag_with_snapshots = tagged_data[0] + empty_tag = Tag.objects.create(name="Zulu Empty", created_by=other_user) + alpha_tag = Tag.objects.create(name="Alpha Empty", created_by=other_user) + Snapshot.objects.create( + url="https://example.com/three", + title="Example Three", + crawl=crawl, + ).tags.add(alpha_tag) + + Tag.objects.filter(pk=empty_tag.pk).update(created_at=timezone.make_aware(datetime(2024, 1, 1, 12, 0, 0))) + Tag.objects.filter(pk=alpha_tag.pk).update(created_at=timezone.make_aware(datetime(2025, 1, 1, 12, 0, 0))) + 
Tag.objects.filter(pk=tag_with_snapshots.pk).update(created_at=timezone.make_aware(datetime(2026, 1, 1, 12, 0, 0))) + + response = client.get( + reverse("api-1:search_tags"), + { + "sort": "name_desc", + "created_by": str(other_user.pk), + "year": "2024", + "has_snapshots": "no", + "api_key": api_token, + }, + HTTP_HOST=ADMIN_HOST, + ) + + assert response.status_code == 200 + payload = response.json() + assert payload["sort"] == "name_desc" + assert payload["created_by"] == str(other_user.pk) + assert payload["year"] == "2024" + assert payload["has_snapshots"] == "no" + assert [tag["name"] for tag in payload["tags"]] == ["Zulu Empty"] + + +def test_tag_rename_api_updates_slug(client, api_token, tagged_data): + tag, _ = tagged_data + + response = client.post( + f"{reverse('api-1:rename_tag', args=[tag.id])}?api_key={api_token}", + data=json.dumps({"name": "Alpha Archive"}), + content_type="application/json", + HTTP_HOST=ADMIN_HOST, + ) + + assert response.status_code == 200 + + tag.refresh_from_db() + assert tag.name == "Alpha Archive" + assert tag.slug == "alpha-archive" + + +def test_tag_snapshots_export_returns_jsonl(client, api_token, tagged_data): + tag, _ = tagged_data + + response = client.get( + reverse("api-1:tag_snapshots_export", args=[tag.id]), + {"api_key": api_token}, + HTTP_HOST=ADMIN_HOST, + ) + + assert response.status_code == 200 + assert response["Content-Type"].startswith("application/x-ndjson") + assert f"tag-{tag.slug}-snapshots.jsonl" in response["Content-Disposition"] + body = response.content.decode() + assert '"type": "Snapshot"' in body + assert '"tags": "Alpha Research"' in body + + +def test_tag_urls_export_returns_plain_text_urls(client, api_token, tagged_data): + tag, snapshots = tagged_data + + response = client.get( + reverse("api-1:tag_urls_export", args=[tag.id]), + {"api_key": api_token}, + HTTP_HOST=ADMIN_HOST, + ) + + assert response.status_code == 200 + assert response["Content-Type"].startswith("text/plain") + assert 
f"tag-{tag.slug}-urls.txt" in response["Content-Disposition"] + exported_urls = set(filter(None, response.content.decode().splitlines())) + assert exported_urls == {snapshot.url for snapshot in snapshots} diff --git a/archivebox/tests/test_tag_service.py b/archivebox/tests/test_tag_service.py new file mode 100644 index 0000000000..defa6f4d39 --- /dev/null +++ b/archivebox/tests/test_tag_service.py @@ -0,0 +1,48 @@ +import asyncio + +import pytest + +from abx_dl.events import TagEvent +from abx_dl.orchestrator import create_bus + + +pytestmark = pytest.mark.django_db(transaction=True) + + +def _create_snapshot(): + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + ) + return Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.STARTED, + ) + + +def test_tag_event_projects_tag_to_snapshot(): + from archivebox.core.models import Tag + from archivebox.services.tag_service import TagService + + snapshot = _create_snapshot() + bus = create_bus(name="test_tag_service") + TagService(bus) + + async def emit_tag_event() -> None: + await bus.emit( + TagEvent( + name="example", + snapshot_id=str(snapshot.id), + ), + ) + + asyncio.run(emit_tag_event()) + + snapshot.refresh_from_db() + assert snapshot.tags.filter(name="example").exists() + assert Tag.objects.filter(name="example").exists() diff --git a/archivebox/tests/test_test_harness.py b/archivebox/tests/test_test_harness.py new file mode 100644 index 0000000000..22aace9d0d --- /dev/null +++ b/archivebox/tests/test_test_harness.py @@ -0,0 +1,30 @@ +import os +from pathlib import Path + +import pytest + +from archivebox.tests import conftest as test_harness + + +def test_session_data_dir_is_outside_repo_root(): + assert test_harness.SESSION_DATA_DIR != 
test_harness.REPO_ROOT + assert test_harness.REPO_ROOT not in test_harness.SESSION_DATA_DIR.parents + assert Path.cwd() != test_harness.REPO_ROOT + if test_harness.REPO_ROOT in Path.cwd().parents: + assert test_harness.PYTEST_BASETEMP_ROOT in (Path.cwd(), *Path.cwd().parents) + + +def test_cli_helpers_reject_repo_root_runtime_paths(): + with pytest.raises(AssertionError, match="repo root"): + test_harness.run_archivebox_cmd(["version"], data_dir=test_harness.REPO_ROOT) + + with pytest.raises(AssertionError, match="repo root"): + test_harness.run_archivebox_cmd_cwd(["version"], cwd=test_harness.REPO_ROOT) + + with pytest.raises(AssertionError, match="repo root"): + test_harness.run_python_cwd("print('hello')", cwd=test_harness.REPO_ROOT) + + +def test_runtime_guard_rejects_chdir_into_repo_root(): + with pytest.raises(AssertionError, match="repo root"): + os.chdir(test_harness.REPO_ROOT) diff --git a/archivebox/tests/test_title.py b/archivebox/tests/test_title.py new file mode 100644 index 0000000000..c4b41e4456 --- /dev/null +++ b/archivebox/tests/test_title.py @@ -0,0 +1,56 @@ +import os +import sqlite3 +import subprocess + +from .fixtures import disable_extractors_dict, process + +FIXTURES = (disable_extractors_dict, process) + + +def test_title_is_extracted(tmp_path, process, disable_extractors_dict): + """Test that title is extracted from the page.""" + disable_extractors_dict.update({"SAVE_TITLE": "true"}) + add_process = subprocess.run( + ["archivebox", "add", "--plugins=title", "https://example.com"], + capture_output=True, + text=True, + env=disable_extractors_dict, + ) + assert add_process.returncode == 0, add_process.stderr or add_process.stdout + + os.chdir(tmp_path) + conn = sqlite3.connect("index.sqlite3") + conn.row_factory = sqlite3.Row + c = conn.cursor() + c.execute("SELECT title FROM core_snapshot") + snapshot = c.fetchone() + conn.close() + + assert snapshot[0] is not None + assert "Example" in snapshot[0] + + +def 
test_title_is_htmlencoded_in_index_html(tmp_path, process, disable_extractors_dict): + """ + https://github.com/ArchiveBox/ArchiveBox/issues/330 + Unencoded content should not be rendered as it facilitates xss injections + and breaks the layout. + """ + disable_extractors_dict.update({"SAVE_TITLE": "true"}) + add_process = subprocess.run( + ["archivebox", "add", "--plugins=title", "https://example.com"], + capture_output=True, + text=True, + env=disable_extractors_dict, + ) + assert add_process.returncode == 0, add_process.stderr or add_process.stdout + list_process = subprocess.run( + ["archivebox", "search", "--html"], + capture_output=True, + text=True, + ) + assert list_process.returncode == 0, list_process.stderr or list_process.stdout + + # Should not contain unescaped HTML tags in output + output = list_process.stdout + assert "https://example.com" in output diff --git a/archivebox/tests/test_update.py b/archivebox/tests/test_update.py new file mode 100644 index 0000000000..b32c6d36b3 --- /dev/null +++ b/archivebox/tests/test_update.py @@ -0,0 +1,195 @@ +import json +import sqlite3 +import subprocess +from datetime import datetime, timedelta + +import pytest +from django.utils import timezone + +from .fixtures import disable_extractors_dict, process + +FIXTURES = (disable_extractors_dict, process) + + +def test_update_imports_orphaned_snapshots(tmp_path, process, disable_extractors_dict): + """Test that archivebox update imports real legacy archive directories.""" + legacy_timestamp = "1710000000" + legacy_dir = tmp_path / "archive" / legacy_timestamp + legacy_dir.mkdir(parents=True, exist_ok=True) + (legacy_dir / "singlefile.html").write_text("example") + (legacy_dir / "index.json").write_text( + json.dumps( + { + "url": "https://example.com", + "timestamp": legacy_timestamp, + "title": "Example Domain", + "fs_version": "0.8.0", + "archive_results": [], + }, + ), + ) + + # Run update without filters - should import and migrate the legacy directory. 
+ update_process = subprocess.run( + ["archivebox", "update"], + capture_output=True, + text=True, + env=disable_extractors_dict, + timeout=60, + ) + assert update_process.returncode == 0, update_process.stderr + + conn = sqlite3.connect(str(tmp_path / "index.sqlite3")) + c = conn.cursor() + row = c.execute("SELECT url, fs_version FROM core_snapshot").fetchone() + conn.commit() + conn.close() + + assert row == ("https://example.com", "0.9.0") + assert legacy_dir.is_symlink() + + migrated_dir = legacy_dir.resolve() + assert migrated_dir.exists() + assert (migrated_dir / "index.jsonl").exists() + assert (migrated_dir / "singlefile.html").exists() + + +@pytest.mark.django_db +def test_reindex_snapshots_resets_existing_search_results_and_reruns_requested_plugins(monkeypatch): + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.cli.archivebox_update import reindex_snapshots + from archivebox.core.models import ArchiveResult, Snapshot + from archivebox.crawls.models import Crawl + import archivebox.cli.archivebox_extract as extract_mod + + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + ) + snapshot = Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.SEALED, + ) + result = ArchiveResult.objects.create( + snapshot=snapshot, + plugin="search_backend_sqlite", + hook_name="on_Snapshot__90_index_sqlite.py", + status=ArchiveResult.StatusChoices.SUCCEEDED, + output_str="old index hit", + output_json={"indexed": True}, + output_files={"search.sqlite3": {"size": 123}}, + output_size=123, + ) + + captured: dict[str, object] = {} + + def fake_run_plugins(*, args, records, wait, emit_results, plugins=""): + captured["args"] = args + captured["records"] = records + captured["wait"] = wait + captured["emit_results"] = emit_results + captured["plugins"] = plugins + return 0 + + monkeypatch.setattr(extract_mod, "run_plugins", 
fake_run_plugins) + + stats = reindex_snapshots( + Snapshot.objects.filter(id=snapshot.id), + search_plugins=["search_backend_sqlite"], + batch_size=10, + ) + + result.refresh_from_db() + + assert stats["processed"] == 1 + assert stats["queued"] == 1 + assert stats["reindexed"] == 1 + assert result.status == ArchiveResult.StatusChoices.QUEUED + assert result.output_str == "" + assert result.output_json is None + assert result.output_files == {} + assert captured == { + "args": (), + "records": [{"type": "ArchiveResult", "snapshot_id": str(snapshot.id), "plugin": "search_backend_sqlite"}], + "wait": True, + "emit_results": False, + "plugins": "", + } + + +@pytest.mark.django_db +def test_build_filtered_snapshots_queryset_respects_resume_cutoff(): + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.cli.archivebox_update import _build_filtered_snapshots_queryset + from archivebox.core.models import Snapshot + from archivebox.crawls.models import Crawl + + crawl = Crawl.objects.create( + urls="https://example.com\nhttps://example.org\nhttps://example.net", + created_by_id=get_or_create_system_user_pk(), + ) + base = timezone.make_aware(datetime(2026, 3, 23, 12, 0, 0)) + older = Snapshot.objects.create( + url="https://example.net", + crawl=crawl, + bookmarked_at=base - timedelta(hours=2), + ) + middle = Snapshot.objects.create( + url="https://example.org", + crawl=crawl, + bookmarked_at=base - timedelta(hours=1), + ) + newer = Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + bookmarked_at=base, + ) + + snapshots = list( + _build_filtered_snapshots_queryset( + filter_patterns=(), + filter_type="exact", + before=None, + after=None, + resume=middle.timestamp, + ).values_list("id", flat=True), + ) + + assert str(newer.id) not in {str(snapshot_id) for snapshot_id in snapshots} + assert set(map(str, snapshots)) == {str(middle.id), str(older.id)} + + +@pytest.mark.django_db +def 
test_reconcile_with_index_json_tolerates_null_title(tmp_path): + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.core.models import Snapshot + from archivebox.crawls.models import Crawl + + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + ) + snapshot = Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + title="Example Domain", + status=Snapshot.StatusChoices.SEALED, + ) + output_dir = snapshot.output_dir + output_dir.mkdir(parents=True, exist_ok=True) + (output_dir / "index.json").write_text( + json.dumps( + { + "url": snapshot.url, + "timestamp": snapshot.timestamp, + "title": None, + "archive_results": [], + }, + ), + ) + + snapshot.reconcile_with_index_json() + snapshot.refresh_from_db() + + assert snapshot.title == "Example Domain" diff --git a/archivebox/tests/test_urls.py b/archivebox/tests/test_urls.py new file mode 100644 index 0000000000..37bcbb082e --- /dev/null +++ b/archivebox/tests/test_urls.py @@ -0,0 +1,936 @@ +import os +import sys +import subprocess +import textwrap +from pathlib import Path + +import pytest + + +REPO_ROOT = Path(__file__).resolve().parents[3] + + +def _merge_pythonpath(env: dict[str, str]) -> dict[str, str]: + env.pop("DATA_DIR", None) + pythonpath = env.get("PYTHONPATH", "") + if pythonpath: + env["PYTHONPATH"] = f"{REPO_ROOT}{os.pathsep}{pythonpath}" + else: + env["PYTHONPATH"] = str(REPO_ROOT) + return env + + +def _run_python(script: str, cwd: Path, timeout: int = 60, env_overrides: dict[str, str] | None = None) -> subprocess.CompletedProcess: + env = _merge_pythonpath(os.environ.copy()) + if env_overrides: + env.update(env_overrides) + return subprocess.run( + [sys.executable, "-"], + cwd=cwd, + env=env, + input=script, + capture_output=True, + text=True, + timeout=timeout, + ) + + +def _build_script(body: str) -> str: + prelude = textwrap.dedent( + """ + import os + from pathlib import Path + + 
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "archivebox.core.settings") + import django + django.setup() + + from django.test import Client + from django.contrib.auth import get_user_model + + from archivebox.core.models import Snapshot, ArchiveResult + from archivebox.config.common import SERVER_CONFIG + from archivebox.core.host_utils import ( + get_admin_host, + get_admin_base_url, + get_api_host, + get_web_host, + get_web_base_url, + get_public_host, + get_snapshot_subdomain, + get_snapshot_host, + get_original_host, + get_listen_subdomain, + split_host_port, + host_matches, + is_snapshot_subdomain, + build_admin_url, + build_snapshot_url, + ) + + def response_body(resp): + if getattr(resp, "streaming", False): + return b"".join(resp.streaming_content) + return resp.content + + def ensure_admin_user(): + User = get_user_model() + admin, _ = User.objects.get_or_create( + username="testadmin", + defaults={"email": "admin@example.com", "is_staff": True, "is_superuser": True}, + ) + admin.set_password("testpassword") + admin.save() + return admin + + def get_snapshot(): + snapshot = Snapshot.objects.order_by("-created_at").first() + assert snapshot is not None, "Expected real_archive_with_example to seed a snapshot" + return snapshot + + def get_snapshot_files(snapshot): + output_rel = None + reserved_snapshot_paths = {"index.html"} + for output in snapshot.discover_outputs(): + candidate = output.get("path") + if not candidate: + continue + if candidate.startswith("responses/"): + continue + if Path(snapshot.output_dir, candidate).is_file(): + output_rel = candidate + break + if output_rel is None: + fallback = Path(snapshot.output_dir, "index.jsonl") + if fallback.exists(): + output_rel = "index.jsonl" + assert output_rel is not None + + responses_root = Path(snapshot.output_dir) / "responses" / snapshot.domain + assert responses_root.exists() + response_file = None + response_rel = None + for candidate in responses_root.rglob("*"): + if not 
candidate.is_file(): + continue + rel = candidate.relative_to(responses_root) + if str(rel) in reserved_snapshot_paths: + continue + if not (Path(snapshot.output_dir) / rel).exists(): + response_file = candidate + response_rel = str(rel) + break + if response_file is None: + for candidate in responses_root.rglob("*"): + if not candidate.is_file(): + continue + rel = candidate.relative_to(responses_root) + if str(rel) in reserved_snapshot_paths: + continue + response_file = candidate + response_rel = str(rel) + break + if response_file is None: + response_file = next(p for p in responses_root.rglob("*") if p.is_file()) + response_rel = str(response_file.relative_to(responses_root)) + response_output_path = Path(snapshot.output_dir) / response_rel + return output_rel, response_file, response_rel, response_output_path + + def write_replay_fixtures(snapshot): + dangerous_html = Path(snapshot.output_dir) / "dangerous.html" + dangerous_html.write_text( + "

    Danger

    ", + encoding="utf-8", + ) + safe_json = Path(snapshot.output_dir) / "safe.json" + safe_json.write_text('{"ok": true}', encoding="utf-8") + responses_root = Path(snapshot.output_dir) / "responses" / snapshot.domain + responses_root.mkdir(parents=True, exist_ok=True) + sniffed_response = responses_root / "dangerous-response" + sniffed_response.write_text( + "

    Response Danger

    ", + encoding="utf-8", + ) + return "dangerous.html", "safe.json", "dangerous-response" + """, + ) + return prelude + "\n" + textwrap.dedent(body) + + +class TestUrlRouting: + data_dir: Path + + @pytest.fixture(autouse=True) + def _setup_data_dir(self, real_archive_with_example: Path) -> None: + self.data_dir = real_archive_with_example + + def _run( + self, + body: str, + timeout: int = 120, + mode: str | None = None, + env_overrides: dict[str, str] | None = None, + ) -> None: + script = _build_script(body) + merged_env = dict(env_overrides or {}) + if mode: + merged_env["SERVER_SECURITY_MODE"] = mode + result = _run_python( + script, + cwd=self.data_dir, + timeout=timeout, + env_overrides=merged_env or None, + ) + assert result.returncode == 0, result.stderr + assert "OK" in result.stdout + + def test_host_utils_and_public_redirect(self) -> None: + self._run( + """ + snapshot = get_snapshot() + snapshot_id = str(snapshot.id) + domain = snapshot.domain + + web_host = get_web_host() + admin_host = get_admin_host() + api_host = get_api_host() + public_host = get_public_host() + snapshot_subdomain = get_snapshot_subdomain(snapshot_id) + snapshot_host = get_snapshot_host(snapshot_id) + original_host = get_original_host(domain) + base_host = SERVER_CONFIG.LISTEN_HOST + + host_only, port = split_host_port(base_host) + assert host_only == "archivebox.localhost" + assert port == "8000" + assert web_host == "web.archivebox.localhost:8000" + assert admin_host == "admin.archivebox.localhost:8000" + assert api_host == "api.archivebox.localhost:8000" + assert public_host == "public.archivebox.localhost:8000" + assert snapshot_subdomain == f"snap-{snapshot_id[-12:].lower()}" + assert snapshot_host == f"{snapshot_subdomain}.archivebox.localhost:8000" + assert original_host == f"{domain}.archivebox.localhost:8000" + assert get_listen_subdomain(web_host) == "web" + assert get_listen_subdomain(admin_host) == "admin" + assert get_listen_subdomain(api_host) == "api" + assert 
get_listen_subdomain(snapshot_host) == snapshot_subdomain + assert get_listen_subdomain(original_host) == domain + assert get_listen_subdomain(base_host) == "" + assert host_matches(web_host, get_web_host()) + assert is_snapshot_subdomain(snapshot_subdomain) + assert is_snapshot_subdomain(snapshot_id) + + client = Client() + resp = client.get("/public.html", HTTP_HOST=web_host) + assert resp.status_code in (301, 302) + assert resp["Location"].endswith("/public/") + + resp = client.get("/public/", HTTP_HOST=base_host) + assert resp.status_code in (301, 302) + assert resp["Location"].startswith(f"http://{web_host}/public/") + + resp = client.get("/", HTTP_HOST=api_host) + assert resp.status_code in (301, 302) + assert resp["Location"].startswith("/api/") + + print("OK") + """, + ) + + def test_web_admin_routing(self) -> None: + self._run( + """ + ensure_admin_user() + snapshot = get_snapshot() + client = Client() + web_host = get_web_host() + public_host = get_public_host() + admin_host = get_admin_host() + snapshot_host = get_snapshot_host(str(snapshot.id)) + original_host = get_original_host(snapshot.domain) + + resp = client.get("/admin/login/", HTTP_HOST=web_host) + assert resp.status_code in (301, 302) + assert admin_host in resp["Location"] + + resp = client.get("/admin/login/?next=/admin/", HTTP_HOST=public_host) + assert resp.status_code in (301, 302) + assert resp["Location"] == f"http://{admin_host}/admin/login/?next=/admin/" + + resp = client.get("/admin/login/?next=/admin/", HTTP_HOST=snapshot_host) + assert resp.status_code in (301, 302) + assert resp["Location"] == f"http://{admin_host}/admin/login/?next=/admin/" + + resp = client.get("/admin/login/?next=/admin/", HTTP_HOST=original_host) + assert resp.status_code in (301, 302) + assert resp["Location"] == f"http://{admin_host}/admin/login/?next=/admin/" + + resp = client.get("/admin/login/", HTTP_HOST=admin_host) + assert resp.status_code == 200 + + resp = client.get(f"/{snapshot.url_path}", 
HTTP_HOST=admin_host) + assert resp.status_code in (301, 302) + assert resp["Location"] == f"http://{snapshot_host}" + + resp = client.get(f"/{snapshot.url_path}/index.html", HTTP_HOST=admin_host) + assert resp.status_code in (301, 302) + assert resp["Location"] == f"http://{snapshot_host}" + + resp = client.get("/static/jquery.min.js", HTTP_HOST=snapshot_host) + assert resp.status_code == 200 + assert "javascript" in (resp.headers.get("Content-Type") or "") + + resp = client.get("/static/jquery.min.js", HTTP_HOST=original_host) + assert resp.status_code == 200 + assert "javascript" in (resp.headers.get("Content-Type") or "") + + print("OK") + """, + ) + + def test_snapshot_routing_and_hosts(self) -> None: + self._run( + """ + import io + import zipfile + + snapshot = get_snapshot() + output_rel, response_file, response_rel, response_output_path = get_snapshot_files(snapshot) + snapshot_id = str(snapshot.id) + snapshot_subdomain = get_snapshot_subdomain(snapshot_id) + snapshot_host = get_snapshot_host(snapshot_id) + original_host = get_original_host(snapshot.domain) + web_host = get_web_host() + host_only, port = split_host_port(SERVER_CONFIG.LISTEN_HOST) + legacy_snapshot_host = f"{snapshot_id}.{host_only}" + if port: + legacy_snapshot_host = f"{legacy_snapshot_host}:{port}" + + client = Client() + + snapshot_path = f"/{snapshot.url_path}/" + resp = client.get(snapshot_path, HTTP_HOST=web_host) + assert resp.status_code == 200 + + resp = client.get(f"/web/{snapshot.domain}", HTTP_HOST=web_host) + assert resp.status_code in (301, 302) + assert resp["Location"].endswith(f"/{snapshot.url_path}") + + resp = client.get(f"/{snapshot.url_path}", HTTP_HOST=web_host) + assert resp.status_code == 200 + + date_segment = snapshot.url_path.split("/")[1] + resp = client.get(f"/web/{date_segment}/{date_segment}/{snapshot_id}/", HTTP_HOST=web_host) + assert resp.status_code == 404 + + resp = client.get(f"/{snapshot.url_path}/{output_rel}", HTTP_HOST=web_host) + assert 
resp.status_code in (301, 302) + assert snapshot_host in resp["Location"] + + resp = client.get("/", HTTP_HOST=legacy_snapshot_host) + assert resp.status_code in (301, 302) + assert resp["Location"].startswith(f"http://{snapshot_host}") + assert snapshot_subdomain in resp["Location"] + + resp = client.get(f"/{output_rel}", HTTP_HOST=snapshot_host) + assert resp.status_code == 200 + assert response_body(resp) == Path(snapshot.output_dir, output_rel).read_bytes() + + resp = client.get(f"/{response_rel}", HTTP_HOST=snapshot_host) + assert resp.status_code == 200 + snapshot_body = response_body(resp) + if response_rel == "index.html": + assert f"http://{snapshot_host}/".encode() in snapshot_body + assert b"See all files..." in snapshot_body + elif response_output_path.exists(): + assert snapshot_body == response_output_path.read_bytes() + else: + assert snapshot_body == response_file.read_bytes() + + resp = client.get(f"/{response_rel}", HTTP_HOST=original_host) + assert resp.status_code == 200 + assert response_body(resp) == response_file.read_bytes() + + resp = client.get("/index.html", HTTP_HOST=snapshot_host) + assert resp.status_code == 200 + snapshot_html = response_body(resp).decode("utf-8", "ignore") + assert f"http://{snapshot_host}/" in snapshot_html + assert "See all files..." 
in snapshot_html + assert ">WARC<" not in snapshot_html + assert ">Media<" not in snapshot_html + assert ">Git<" not in snapshot_html + + resp = client.get("/?files=1", HTTP_HOST=snapshot_host) + assert resp.status_code == 200 + files_html = response_body(resp).decode("utf-8", "ignore") + assert output_rel.split("/", 1)[0] in files_html + + resp = client.get("/?files=1&download=zip", HTTP_HOST=snapshot_host) + assert resp.status_code == 200 + assert resp["Content-Type"] == "application/zip" + assert ".zip" in resp["Content-Disposition"] + assert resp.streaming + with zipfile.ZipFile(io.BytesIO(response_body(resp))) as zip_file: + assert any(name.endswith(f"/{output_rel}") for name in zip_file.namelist()) + + output_dir = next((output.get("path", "").split("/", 1)[0] for output in snapshot.discover_outputs() if "/" in (output.get("path") or "")), None) + assert output_dir is not None + resp = client.get(f"/{output_dir}/", HTTP_HOST=snapshot_host) + assert resp.status_code == 200 + dir_html = response_body(resp).decode("utf-8", "ignore") + assert f"Index of {output_dir}/" in dir_html + + print("OK") + """, + ) + + def test_safe_subdomains_original_domain_host_uses_latest_matching_response(self) -> None: + self._run( + """ + from datetime import timedelta + import shutil + from django.utils import timezone + from archivebox.crawls.models import Crawl + + snapshot = get_snapshot() + original_host = get_original_host(snapshot.domain) + client = Client() + + assert SERVER_CONFIG.SERVER_SECURITY_MODE == "safe-subdomains-fullreplay" + + now = timezone.now() + created_by_id = snapshot.crawl.created_by_id + created_snapshots = [] + created_crawls = [] + + def make_snapshot(url): + crawl = Crawl.objects.create(urls=url, created_by_id=created_by_id) + created_crawls.append(crawl) + snap = Snapshot.objects.create(url=url, crawl=crawl, status=Snapshot.StatusChoices.STARTED) + created_snapshots.append(snap) + return snap + + try: + fixtures = ( + 
(make_snapshot("https://example.com"), now + timedelta(minutes=1), "old root"), + (make_snapshot("https://example.com"), now + timedelta(minutes=2), "new root"), + (make_snapshot("https://example.com/about.html"), now + timedelta(minutes=3), "old about"), + (make_snapshot("https://example.com/about.html"), now + timedelta(minutes=4), "new about"), + ) + + for snap, stamp, content in fixtures: + snap.created_at = stamp + snap.bookmarked_at = stamp + snap.downloaded_at = stamp + snap.save(update_fields=["created_at", "bookmarked_at", "downloaded_at", "modified_at"]) + responses_root = Path(snap.output_dir) / "responses" / snap.domain + responses_root.mkdir(parents=True, exist_ok=True) + rel_path = "about.html" if snap.url.endswith("/about.html") else "index.html" + (responses_root / rel_path).write_text(content, encoding="utf-8") + + resp = client.get("/", HTTP_HOST=original_host) + assert resp.status_code == 200 + root_html = response_body(resp).decode("utf-8", "ignore") + assert "new root" in root_html + assert "old root" not in root_html + + resp = client.get("/about.html", HTTP_HOST=original_host) + assert resp.status_code == 200 + about_html = response_body(resp).decode("utf-8", "ignore") + assert "new about" in about_html + assert "old about" not in about_html + finally: + for snap in created_snapshots: + shutil.rmtree(snap.output_dir, ignore_errors=True) + for crawl in created_crawls: + crawl.delete() + + print("OK") + """, + ) + + def test_safe_subdomains_original_domain_host_falls_back_to_latest_snapshot_live_page(self) -> None: + self._run( + """ + import shutil + from django.utils import timezone + from archivebox.crawls.models import Crawl + + snapshot = get_snapshot() + fallback_domain = "fallback-original-host.example" + original_host = get_original_host(fallback_domain) + client = Client() + + assert SERVER_CONFIG.SERVER_SECURITY_MODE == "safe-subdomains-fullreplay" + + crawl = Crawl.objects.create(urls=f"https://{fallback_domain}", 
created_by_id=snapshot.crawl.created_by_id) + latest_snapshot = Snapshot.objects.create( + url=f"https://{fallback_domain}", + crawl=crawl, + status=Snapshot.StatusChoices.STARTED, + ) + + stamp = timezone.now() + latest_snapshot.created_at = stamp + latest_snapshot.bookmarked_at = stamp + latest_snapshot.downloaded_at = stamp + latest_snapshot.save(update_fields=["created_at", "bookmarked_at", "downloaded_at", "modified_at"]) + + try: + shutil.rmtree(Path(latest_snapshot.output_dir) / "responses", ignore_errors=True) + + resp = client.get("/", HTTP_HOST=original_host) + assert resp.status_code == 200 + html = response_body(resp).decode("utf-8", "ignore") + assert latest_snapshot.url in html + assert f"http://{get_snapshot_host(str(latest_snapshot.id))}/" in html + finally: + shutil.rmtree(latest_snapshot.output_dir, ignore_errors=True) + crawl.delete() + + print("OK") + """, + ) + + def test_safe_subdomains_original_domain_host_redirects_to_save_page_now_when_missing_and_authenticated(self) -> None: + self._run( + """ + ensure_admin_user() + client = Client() + client.login(username="testadmin", password="testpassword") + + missing_domain = "missing-original-host.example" + original_host = get_original_host(missing_domain) + resp = client.get("/", HTTP_HOST=original_host) + + assert resp.status_code in (301, 302) + assert resp["Location"] == f"http://{get_web_host()}/web/https://{missing_domain}" + + print("OK") + """, + ) + + def test_safe_subdomains_fullreplay_leaves_risky_replay_unrestricted(self) -> None: + self._run( + """ + snapshot = get_snapshot() + dangerous_rel, safe_json_rel, sniffed_rel = write_replay_fixtures(snapshot) + snapshot_host = get_snapshot_host(str(snapshot.id)) + + client = Client() + + resp = client.get(f"/{dangerous_rel}", HTTP_HOST=snapshot_host) + assert resp.status_code == 200 + assert resp.headers.get("Content-Security-Policy") is None + assert resp.headers.get("X-Content-Type-Options") == "nosniff" + + resp = 
client.get(f"/{safe_json_rel}", HTTP_HOST=snapshot_host) + assert resp.status_code == 200 + assert resp.headers.get("Content-Security-Policy") is None + + resp = client.get(f"/{sniffed_rel}", HTTP_HOST=snapshot_host) + assert resp.status_code == 200 + assert resp.headers.get("Content-Security-Policy") is None + + print("OK") + """, + ) + + def test_safe_onedomain_nojsreplay_routes_and_neuters_risky_documents(self) -> None: + self._run( + """ + ensure_admin_user() + snapshot = get_snapshot() + dangerous_rel, safe_json_rel, sniffed_rel = write_replay_fixtures(snapshot) + snapshot_id = str(snapshot.id) + + client = Client() + base_host = SERVER_CONFIG.LISTEN_HOST + web_host = get_web_host() + admin_host = get_admin_host() + api_host = get_api_host() + + assert SERVER_CONFIG.SERVER_SECURITY_MODE == "safe-onedomain-nojsreplay" + assert web_host == base_host + assert admin_host == base_host + assert api_host == base_host + assert get_snapshot_host(snapshot_id) == base_host + assert get_original_host(snapshot.domain) == base_host + assert get_listen_subdomain(base_host) == "" + + replay_url = build_snapshot_url(snapshot_id, dangerous_rel) + assert replay_url == f"http://{base_host}/snapshot/{snapshot_id}/{dangerous_rel}" + + resp = client.get(f"/{snapshot.url_path}/{dangerous_rel}", HTTP_HOST=base_host) + assert resp.status_code in (301, 302) + assert resp["Location"] == replay_url + + resp = client.get("/admin/login/", HTTP_HOST=base_host) + assert resp.status_code == 200 + + resp = client.get("/api/v1/docs", HTTP_HOST=base_host) + assert resp.status_code == 200 + + resp = client.get(f"/snapshot/{snapshot_id}/{dangerous_rel}", HTTP_HOST=base_host) + assert resp.status_code == 200 + csp = resp.headers.get("Content-Security-Policy") or "" + assert "sandbox" in csp + assert "script-src 'none'" in csp + assert resp.headers.get("X-Content-Type-Options") == "nosniff" + + resp = client.get(f"/snapshot/{snapshot_id}/{safe_json_rel}", HTTP_HOST=base_host) + assert 
resp.status_code == 200 + assert resp.headers.get("Content-Security-Policy") is None + assert resp.headers.get("X-Content-Type-Options") == "nosniff" + + resp = client.get("/snapshot/{}/singlefile/".format(snapshot_id), HTTP_HOST=base_host) + assert resp.status_code == 404 + + resp = client.get(f"/snapshot/{snapshot_id}/{sniffed_rel}", HTTP_HOST=base_host) + assert resp.status_code == 200 + csp = resp.headers.get("Content-Security-Policy") or "" + assert "sandbox" in csp + assert "script-src 'none'" in csp + + print("OK") + """, + mode="safe-onedomain-nojsreplay", + ) + + def test_unsafe_onedomain_noadmin_blocks_control_plane_and_unsafe_methods(self) -> None: + self._run( + """ + ensure_admin_user() + snapshot = get_snapshot() + dangerous_rel, _, _ = write_replay_fixtures(snapshot) + snapshot_id = str(snapshot.id) + + client = Client() + base_host = SERVER_CONFIG.LISTEN_HOST + + assert SERVER_CONFIG.SERVER_SECURITY_MODE == "unsafe-onedomain-noadmin" + assert SERVER_CONFIG.CONTROL_PLANE_ENABLED is False + assert SERVER_CONFIG.BLOCK_UNSAFE_METHODS is True + assert get_web_host() == base_host + assert get_admin_host() == base_host + assert get_api_host() == base_host + + for blocked_path in ("/admin/login/", "/api/v1/docs", "/add/", f"/web/{snapshot.domain}"): + resp = client.get(blocked_path, HTTP_HOST=base_host) + assert resp.status_code == 403, (blocked_path, resp.status_code) + + resp = client.post("/public/", data="x=1", content_type="application/x-www-form-urlencoded", HTTP_HOST=base_host) + assert resp.status_code == 403 + + resp = client.get(f"/snapshot/{snapshot_id}/{dangerous_rel}", HTTP_HOST=base_host) + assert resp.status_code == 200 + assert resp.headers.get("Content-Security-Policy") is None + assert resp.headers.get("X-Content-Type-Options") == "nosniff" + + print("OK") + """, + mode="unsafe-onedomain-noadmin", + ) + + def test_danger_onedomain_fullreplay_keeps_control_plane_and_raw_replay(self) -> None: + self._run( + """ + ensure_admin_user() + 
snapshot = get_snapshot() + dangerous_rel, _, _ = write_replay_fixtures(snapshot) + snapshot_id = str(snapshot.id) + + client = Client() + base_host = SERVER_CONFIG.LISTEN_HOST + + assert SERVER_CONFIG.SERVER_SECURITY_MODE == "danger-onedomain-fullreplay" + assert SERVER_CONFIG.CONTROL_PLANE_ENABLED is True + assert get_web_host() == base_host + assert get_admin_host() == base_host + assert get_api_host() == base_host + assert build_snapshot_url(snapshot_id, dangerous_rel) == f"http://{base_host}/snapshot/{snapshot_id}/{dangerous_rel}" + + resp = client.get("/admin/login/", HTTP_HOST=base_host) + assert resp.status_code == 200 + + resp = client.get("/api/v1/docs", HTTP_HOST=base_host) + assert resp.status_code == 200 + + payload = '{"username": "testadmin", "password": "testpassword"}' + resp = client.post( + "/api/v1/auth/get_api_token", + data=payload, + content_type="application/json", + HTTP_HOST=base_host, + ) + assert resp.status_code == 200 + assert resp.json().get("token") + + resp = client.get(f"/snapshot/{snapshot_id}/{dangerous_rel}", HTTP_HOST=base_host) + assert resp.status_code == 200 + assert resp.headers.get("Content-Security-Policy") is None + assert resp.headers.get("X-Content-Type-Options") == "nosniff" + + print("OK") + """, + mode="danger-onedomain-fullreplay", + ) + + def test_onedomain_base_url_overrides_are_preserved_for_external_links(self) -> None: + self._run( + """ + snapshot = get_snapshot() + snapshot_id = str(snapshot.id) + base_host = SERVER_CONFIG.LISTEN_HOST + + assert SERVER_CONFIG.SERVER_SECURITY_MODE == "safe-onedomain-nojsreplay" + assert get_admin_host() == base_host + assert get_web_host() == base_host + + assert get_admin_base_url() == "https://admin.archivebox.example" + assert get_web_base_url() == "https://archivebox.example" + assert build_admin_url("/admin/login/") == "https://admin.archivebox.example/admin/login/" + assert build_snapshot_url(snapshot_id, "index.jsonl") == ( + 
f"https://archivebox.example/snapshot/{snapshot_id}/index.jsonl" + ) + + print("OK") + """, + mode="safe-onedomain-nojsreplay", + env_overrides={ + "ADMIN_BASE_URL": "https://admin.archivebox.example", + "ARCHIVE_BASE_URL": "https://archivebox.example", + }, + ) + + def test_template_and_admin_links(self) -> None: + self._run( + """ + ensure_admin_user() + snapshot = get_snapshot() + snapshot.write_html_details() + snapshot_id = str(snapshot.id) + snapshot_host = get_snapshot_host(snapshot_id) + admin_host = get_admin_host() + web_host = get_web_host() + public_host = get_public_host() + + client = Client() + + resp = client.get("/public/", HTTP_HOST=web_host) + assert resp.status_code == 200 + public_html = response_body(resp).decode("utf-8", "ignore") + assert "http://web.archivebox.localhost:8000" in public_html + + resp = client.get(f"/{snapshot.url_path}/index.html", HTTP_HOST=web_host) + assert resp.status_code == 200 + live_html = response_body(resp).decode("utf-8", "ignore") + assert f"http://{snapshot_host}/" in live_html + assert f"http://{public_host}/static/archive.png" in live_html + assert "?preview=1" in live_html + assert "function createMainFrame(previousFrame)" in live_html + assert "function activateCardPreview(card, link)" in live_html + assert "ensureMainFrame(true)" in live_html + assert "previousFrame.parentNode.replaceChild(frame, previousFrame)" in live_html + assert "previousFrame.src = 'about:blank'" in live_html + assert "event.stopImmediatePropagation()" in live_html + assert "const matchingLink = [...document.querySelectorAll('a[target=preview]')].find" in live_html + assert "jQuery(link).click()" not in live_html + assert "searchParams.delete('preview')" in live_html + assert "doc.body.style.flexDirection = 'column'" in live_html + assert "doc.body.style.alignItems = 'center'" in live_html + assert "img.style.margin = '0 auto'" in live_html + assert "window.location.hash = getPreviewHashValueFromHref(rawTarget)" in live_html + assert 
"const selectedPreviewHash = decodeURIComponent(window.location.hash.slice(1)).toLowerCase()" in live_html + assert "pointer-events: none;" in live_html + assert "pointer-events: auto;" in live_html + assert 'class="thumbnail-click-overlay"' in live_html + assert "window.location.hash = getPreviewTypeFromPath(link)" not in live_html + assert ">WARC<" not in live_html + assert ">Media<" not in live_html + assert ">Git<" not in live_html + + static_html = Path(snapshot.output_dir, "index.html").read_text(encoding="utf-8", errors="ignore") + assert f"http://{snapshot_host}/" in static_html + assert f"http://{public_host}/static/archive.png" in static_html + assert "?preview=1" in static_html + assert "function createMainFrame(previousFrame)" in static_html + assert "function activateCardPreview(card, link)" in static_html + assert "ensureMainFrame(true)" in static_html + assert "previousFrame.parentNode.replaceChild(frame, previousFrame)" in static_html + assert "previousFrame.src = 'about:blank'" in static_html + assert "event.stopImmediatePropagation()" in static_html + assert "const matchingLink = [...document.querySelectorAll('a[target=preview]')].find" in static_html + assert "jQuery(link).click()" not in static_html + assert "searchParams.delete('preview')" in static_html + assert "doc.body.style.flexDirection = 'column'" in static_html + assert "doc.body.style.alignItems = 'center'" in static_html + assert "img.style.margin = '0 auto'" in static_html + assert "window.location.hash = getPreviewHashValueFromHref(rawTarget)" in static_html + assert "const selectedPreviewHash = decodeURIComponent(window.location.hash.slice(1)).toLowerCase()" in static_html + assert "pointer-events: none;" in static_html + assert "pointer-events: auto;" in static_html + assert 'class="thumbnail-click-overlay"' in static_html + assert "window.location.hash = getPreviewTypeFromPath(link)" not in static_html + assert ">WARC<" not in static_html + assert ">Media<" not in static_html + 
assert ">Git<" not in static_html + + client.login(username="testadmin", password="testpassword") + resp = client.get(f"/admin/core/snapshot/{snapshot_id}/change/", HTTP_HOST=admin_host) + assert resp.status_code == 200 + admin_html = response_body(resp).decode("utf-8", "ignore") + assert f"http://web.archivebox.localhost:8000/{snapshot.archive_path}" in admin_html + assert f"http://{snapshot_host}/" in admin_html + + result = ArchiveResult.objects.filter(snapshot=snapshot).first() + assert result is not None + resp = client.get(f"/admin/core/archiveresult/{result.id}/change/", HTTP_HOST=admin_host) + assert resp.status_code == 200 + ar_html = response_body(resp).decode("utf-8", "ignore") + assert f"http://{snapshot_host}/" in ar_html + + print("OK") + """, + ) + + def test_snapshot_pages_preview_filesystem_text_outputs(self) -> None: + self._run( + """ + snapshot = get_snapshot() + web_host = get_web_host() + + consolelog_dir = Path(snapshot.output_dir) / "consolelog" + consolelog_dir.mkdir(parents=True, exist_ok=True) + (consolelog_dir / "console.jsonl").write_text( + '{"level":"log","text":"console preview works"}\\n' + '{"level":"warn","text":"second line"}\\n', + encoding="utf-8", + ) + + client = Client() + resp = client.get(f"/{snapshot.url_path}/index.html", HTTP_HOST=web_host) + assert resp.status_code == 200 + live_html = response_body(resp).decode("utf-8", "ignore") + assert 'data-plugin="consolelog" data-compact="1"' in live_html + assert "console preview works" in live_html + snapshot_host = get_snapshot_host(str(snapshot.id)) + resp = client.get("/consolelog/console.jsonl?preview=1", HTTP_HOST=snapshot_host) + assert resp.status_code == 200 + assert resp["Content-Type"].startswith("text/html") + preview_html = response_body(resp).decode("utf-8", "ignore") + assert "archivebox-text-preview" in preview_html + assert "console preview works" in preview_html + + screenshot_dir = Path(snapshot.output_dir) / "screenshot" + screenshot_dir.mkdir(parents=True, 
exist_ok=True) + (screenshot_dir / "screenshot.png").write_bytes( + bytes.fromhex( + "89504e470d0a1a0a" + "0000000d49484452000000010000000108060000001f15c489" + "0000000d49444154789c63f8ffffff7f0009fb03fd2a86e38a" + "0000000049454e44ae426082", + ), + ) + resp = client.get("/screenshot/screenshot.png?preview=1", HTTP_HOST=snapshot_host) + assert resp.status_code == 200 + assert resp["Content-Type"].startswith("text/html") + + print("OK") + """, + ) + + def test_api_available_on_admin_and_api_hosts(self) -> None: + self._run( + """ + client = Client() + admin_host = get_admin_host() + api_host = get_api_host() + + resp = client.get("/api/v1/docs", HTTP_HOST=admin_host) + assert resp.status_code == 200 + + resp = client.get("/api/v1/docs", HTTP_HOST=api_host) + assert resp.status_code == 200 + + print("OK") + """, + ) + + def test_api_auth_token_endpoint_available_on_admin_and_api_hosts(self) -> None: + self._run( + """ + ensure_admin_user() + client = Client() + admin_host = get_admin_host() + api_host = get_api_host() + + payload = '{"username": "testadmin", "password": "testpassword"}' + + resp = client.post( + "/api/v1/auth/get_api_token", + data=payload, + content_type="application/json", + HTTP_HOST=admin_host, + ) + assert resp.status_code == 200 + data = resp.json() + assert data.get("token") + + resp = client.post( + "/api/v1/auth/get_api_token", + data=payload, + content_type="application/json", + HTTP_HOST=api_host, + ) + assert resp.status_code == 200 + data = resp.json() + assert data.get("token") + + print("OK") + """, + ) + + def test_api_post_with_token_on_admin_and_api_hosts(self) -> None: + self._run( + """ + ensure_admin_user() + from archivebox.api.auth import get_or_create_api_token + + token = get_or_create_api_token(get_user_model().objects.get(username="testadmin")) + assert token is not None + + client = Client() + admin_host = get_admin_host() + api_host = get_api_host() + + payload = '{"name": "apitest-tag"}' + headers = 
{"HTTP_X_ARCHIVEBOX_API_KEY": token.token} + + resp = client.post( + "/api/v1/core/tags/create/", + data=payload, + content_type="application/json", + HTTP_HOST=admin_host, + **headers, + ) + assert resp.status_code == 200 + data = resp.json() + assert data.get("success") is True + assert data.get("tag_name") == "apitest-tag" + + resp = client.post( + "/api/v1/core/tags/create/", + data=payload, + content_type="application/json", + HTTP_HOST=api_host, + **headers, + ) + assert resp.status_code == 200 + data = resp.json() + assert data.get("success") is True + assert data.get("tag_name") == "apitest-tag" + + print("OK") + """, + ) diff --git a/archivebox/tests/test_util.py b/archivebox/tests/test_util.py new file mode 100644 index 0000000000..8ebe2ca289 --- /dev/null +++ b/archivebox/tests/test_util.py @@ -0,0 +1,31 @@ +from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer +from threading import Thread + +from archivebox.misc.util import download_url + + +class _ExampleHandler(BaseHTTPRequestHandler): + def do_GET(self): + body = b"

    Example Domain

    " + self.send_response(200) + self.send_header("Content-Type", "text/html; charset=utf-8") + self.send_header("Content-Length", str(len(body))) + self.end_headers() + self.wfile.write(body) + + def log_message(self, format, *args): + return + + +def test_download_url_downloads_content(): + server = ThreadingHTTPServer(("127.0.0.1", 0), _ExampleHandler) + thread = Thread(target=server.serve_forever, daemon=True) + thread.start() + try: + text = download_url(f"http://127.0.0.1:{server.server_address[1]}/") + finally: + server.shutdown() + server.server_close() + thread.join(timeout=5) + + assert "Example Domain" in text diff --git a/archivebox/util.py b/archivebox/util.py deleted file mode 100644 index 814c803822..0000000000 --- a/archivebox/util.py +++ /dev/null @@ -1,335 +0,0 @@ -__package__ = 'archivebox' - -import re -import requests -import json as pyjson - -from typing import List, Optional, Any -from pathlib import Path -from inspect import signature -from functools import wraps -from hashlib import sha256 -from urllib.parse import urlparse, quote, unquote -from html import escape, unescape -from datetime import datetime, timezone -from dateparser import parse as dateparser -from requests.exceptions import RequestException, ReadTimeout - -from .vendor.base32_crockford import encode as base32_encode # type: ignore -from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding - -try: - import chardet - detect_encoding = lambda rawdata: chardet.detect(rawdata)["encoding"] -except ImportError: - detect_encoding = lambda rawdata: "utf-8" - -### Parsing Helpers - -# All of these are (str) -> str -# shortcuts to: https://docs.python.org/3/library/urllib.parse.html#url-parsing -scheme = lambda url: urlparse(url).scheme.lower() -without_scheme = lambda url: urlparse(url)._replace(scheme='').geturl().strip('//') -without_query = lambda url: urlparse(url)._replace(query='').geturl().strip('//') -without_fragment = lambda url: 
urlparse(url)._replace(fragment='').geturl().strip('//') -without_path = lambda url: urlparse(url)._replace(path='', fragment='', query='').geturl().strip('//') -path = lambda url: urlparse(url).path -basename = lambda url: urlparse(url).path.rsplit('/', 1)[-1] -domain = lambda url: urlparse(url).netloc -query = lambda url: urlparse(url).query -fragment = lambda url: urlparse(url).fragment -extension = lambda url: basename(url).rsplit('.', 1)[-1].lower() if '.' in basename(url) else '' -base_url = lambda url: without_scheme(url) # uniq base url used to dedupe links - -without_www = lambda url: url.replace('://www.', '://', 1) -without_trailing_slash = lambda url: url[:-1] if url[-1] == '/' else url.replace('/?', '?') -hashurl = lambda url: base32_encode(int(sha256(base_url(url).encode('utf-8')).hexdigest(), 16))[:20] - -urlencode = lambda s: s and quote(s, encoding='utf-8', errors='replace') -urldecode = lambda s: s and unquote(s) -htmlencode = lambda s: s and escape(s, quote=True) -htmldecode = lambda s: s and unescape(s) - -short_ts = lambda ts: str(parse_date(ts).timestamp()).split('.')[0] -ts_to_date_str = lambda ts: ts and parse_date(ts).strftime('%Y-%m-%d %H:%M') -ts_to_iso = lambda ts: ts and parse_date(ts).isoformat() - - -URL_REGEX = re.compile( - r'(?=(' - r'http[s]?://' # start matching from allowed schemes - r'(?:[a-zA-Z]|[0-9]' # followed by allowed alphanum characters - r'|[$-_@.&+]|[!*\(\),]' # or allowed symbols - r'|(?:%[0-9a-fA-F][0-9a-fA-F]))' # or allowed unicode bytes - r'[^\]\[\(\)<>"\'\s]+' # stop parsing at these symbols - r'))', - re.IGNORECASE, -) - -COLOR_REGEX = re.compile(r'\[(?P\d+)(;(?P\d+)(;(?P\d+))?)?m') - -def is_static_file(url: str): - # TODO: the proper way is with MIME type detection + ext, not only extension - from .config import STATICFILE_EXTENSIONS - return extension(url).lower() in STATICFILE_EXTENSIONS - - -def enforce_types(func): - """ - Enforce function arg and kwarg types at runtime using its python3 type hints - """ 
- # TODO: check return type as well - - @wraps(func) - def typechecked_function(*args, **kwargs): - sig = signature(func) - - def check_argument_type(arg_key, arg_val): - try: - annotation = sig.parameters[arg_key].annotation - except KeyError: - annotation = None - - if annotation is not None and annotation.__class__ is type: - if not isinstance(arg_val, annotation): - raise TypeError( - '{}(..., {}: {}) got unexpected {} argument {}={}'.format( - func.__name__, - arg_key, - annotation.__name__, - type(arg_val).__name__, - arg_key, - str(arg_val)[:64], - ) - ) - - # check args - for arg_val, arg_key in zip(args, sig.parameters): - check_argument_type(arg_key, arg_val) - - # check kwargs - for arg_key, arg_val in kwargs.items(): - check_argument_type(arg_key, arg_val) - - return func(*args, **kwargs) - - return typechecked_function - - -def docstring(text: Optional[str]): - """attach the given docstring to the decorated function""" - def decorator(func): - if text: - func.__doc__ = text - return func - return decorator - - -@enforce_types -def str_between(string: str, start: str, end: str=None) -> str: - """(12345, , ) -> 12345""" - - content = string.split(start, 1)[-1] - if end is not None: - content = content.rsplit(end, 1)[0] - - return content - - -@enforce_types -def parse_date(date: Any) -> Optional[datetime]: - """Parse unix timestamps, iso format, and human-readable strings""" - - if date is None: - return None - - if isinstance(date, datetime): - if date.tzinfo is None: - return date.replace(tzinfo=timezone.utc) - - assert date.tzinfo.utcoffset(datetime.now()).seconds == 0, 'Refusing to load a non-UTC date!' - return date - - if isinstance(date, (float, int)): - date = str(date) - - if isinstance(date, str): - return dateparser(date, settings={'TIMEZONE': 'UTC'}).replace(tzinfo=timezone.utc) - - raise ValueError('Tried to parse invalid date! 
{}'.format(date)) - - -@enforce_types -def download_url(url: str, timeout: int=None) -> str: - """Download the contents of a remote url and return the text""" - from .config import TIMEOUT, CHECK_SSL_VALIDITY, WGET_USER_AGENT - timeout = timeout or TIMEOUT - response = requests.get( - url, - headers={'User-Agent': WGET_USER_AGENT}, - verify=CHECK_SSL_VALIDITY, - timeout=timeout, - ) - - content_type = response.headers.get('Content-Type', '') - encoding = http_content_type_encoding(content_type) or html_body_declared_encoding(response.text) - - if encoding is not None: - response.encoding = encoding - - return response.text - -@enforce_types -def get_headers(url: str, timeout: int=None) -> str: - """Download the contents of a remote url and return the headers""" - from .config import TIMEOUT, CHECK_SSL_VALIDITY, WGET_USER_AGENT - timeout = timeout or TIMEOUT - - try: - response = requests.head( - url, - headers={'User-Agent': WGET_USER_AGENT}, - verify=CHECK_SSL_VALIDITY, - timeout=timeout, - allow_redirects=True, - ) - if response.status_code >= 400: - raise RequestException - except ReadTimeout: - raise - except RequestException: - response = requests.get( - url, - headers={'User-Agent': WGET_USER_AGENT}, - verify=CHECK_SSL_VALIDITY, - timeout=timeout, - stream=True - ) - - return pyjson.dumps( - { - 'Status-Code': response.status_code, - **dict(response.headers), - }, - indent=4, - ) - - -@enforce_types -def chrome_args(**options) -> List[str]: - """helper to build up a chrome shell command with arguments""" - - from .config import CHROME_OPTIONS - - options = {**CHROME_OPTIONS, **options} - - if not options['CHROME_BINARY']: - raise Exception('Could not find any CHROME_BINARY installed on your system') - - cmd_args = [options['CHROME_BINARY']] - - if options['CHROME_HEADLESS']: - cmd_args += ('--headless',) - - if not options['CHROME_SANDBOX']: - # assume this means we are running inside a docker container - # in docker, GPU support is limited, sandboxing is 
unecessary, - # and SHM is limited to 64MB by default (which is too low to be usable). - cmd_args += ( - '--no-sandbox', - '--disable-gpu', - '--disable-dev-shm-usage', - '--disable-software-rasterizer', - '--run-all-compositor-stages-before-draw', - '--hide-scrollbars', - ) - - - if not options['CHECK_SSL_VALIDITY']: - cmd_args += ('--disable-web-security', '--ignore-certificate-errors') - - if options['CHROME_USER_AGENT']: - cmd_args += ('--user-agent={}'.format(options['CHROME_USER_AGENT']),) - - if options['RESOLUTION']: - cmd_args += ('--window-size={}'.format(options['RESOLUTION']),) - - if options['TIMEOUT']: - cmd_args += ('--timeout={}'.format(options['TIMEOUT'] * 1000),) - - if options['CHROME_USER_DATA_DIR']: - cmd_args.append('--user-data-dir={}'.format(options['CHROME_USER_DATA_DIR'])) - - return cmd_args - - -def ansi_to_html(text): - """ - Based on: https://stackoverflow.com/questions/19212665/python-converting-ansi-color-codes-to-html - """ - from .config import COLOR_DICT - - TEMPLATE = '
    ' - text = text.replace('[m', '
    ') - - def single_sub(match): - argsdict = match.groupdict() - if argsdict['arg_3'] is None: - if argsdict['arg_2'] is None: - _, color = 0, argsdict['arg_1'] - else: - _, color = argsdict['arg_1'], argsdict['arg_2'] - else: - _, color = argsdict['arg_3'], argsdict['arg_2'] - - return TEMPLATE.format(COLOR_DICT[color][0]) - - return COLOR_REGEX.sub(single_sub, text) - - -class AttributeDict(dict): - """Helper to allow accessing dict values via Example.key or Example['key']""" - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - # Recursively convert nested dicts to AttributeDicts (optional): - # for key, val in self.items(): - # if isinstance(val, dict) and type(val) is not AttributeDict: - # self[key] = AttributeDict(val) - - def __getattr__(self, attr: str) -> Any: - return dict.__getitem__(self, attr) - - def __setattr__(self, attr: str, value: Any) -> None: - return dict.__setitem__(self, attr, value) - - -class ExtendedEncoder(pyjson.JSONEncoder): - """ - Extended json serializer that supports serializing several model - fields and objects - """ - - def default(self, obj): - cls_name = obj.__class__.__name__ - - if hasattr(obj, '_asdict'): - return obj._asdict() - - elif isinstance(obj, bytes): - return obj.decode() - - elif isinstance(obj, datetime): - return obj.isoformat() - - elif isinstance(obj, Exception): - return '{}: {}'.format(obj.__class__.__name__, obj) - - elif isinstance(obj, Path): - return str(obj) - - elif cls_name in ('dict_items', 'dict_keys', 'dict_values'): - return tuple(obj) - - return pyjson.JSONEncoder.default(self, obj) - diff --git a/archivebox/uuid_compat.py b/archivebox/uuid_compat.py new file mode 100755 index 0000000000..dbccb1634b --- /dev/null +++ b/archivebox/uuid_compat.py @@ -0,0 +1,17 @@ +"""UUID7 compatibility layer.""" + +import sys +import uuid +from importlib import import_module + +if sys.version_info >= (3, 14): + _UUID7_GENERATOR = getattr(uuid, "uuid7") +else: + _UUID7_GENERATOR = 
getattr(import_module("uuid_extensions"), "uuid7") + + +def uuid7() -> uuid.UUID: + return _UUID7_GENERATOR() + + +__all__ = ["uuid7"] diff --git a/archivebox/vendor/atomicwrites.py b/archivebox/vendor/atomicwrites.py deleted file mode 120000 index 73abfe4caf..0000000000 --- a/archivebox/vendor/atomicwrites.py +++ /dev/null @@ -1 +0,0 @@ -python-atomicwrites/atomicwrites/__init__.py \ No newline at end of file diff --git a/archivebox/vendor/base32-crockford b/archivebox/vendor/base32-crockford deleted file mode 160000 index 1ffb602148..0000000000 --- a/archivebox/vendor/base32-crockford +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 1ffb6021485b666ea6a562abd0a1ea6f7021188f diff --git a/archivebox/vendor/base32_crockford.py b/archivebox/vendor/base32_crockford.py deleted file mode 120000 index a5d9c64f54..0000000000 --- a/archivebox/vendor/base32_crockford.py +++ /dev/null @@ -1 +0,0 @@ -base32-crockford/base32_crockford.py \ No newline at end of file diff --git a/archivebox/vendor/django-taggit b/archivebox/vendor/django-taggit deleted file mode 160000 index 1e4dca37e5..0000000000 --- a/archivebox/vendor/django-taggit +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 1e4dca37e534ca70e99c39fb4198970eb8aad5aa diff --git a/archivebox/vendor/pocket b/archivebox/vendor/pocket deleted file mode 160000 index 3a0c5c7683..0000000000 --- a/archivebox/vendor/pocket +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 3a0c5c76832b0e92923383af3f9831ece7901c2f diff --git a/archivebox/vendor/pocket.py b/archivebox/vendor/pocket.py deleted file mode 120000 index 37352d277e..0000000000 --- a/archivebox/vendor/pocket.py +++ /dev/null @@ -1 +0,0 @@ -pocket/pocket.py \ No newline at end of file diff --git a/archivebox/vendor/python-atomicwrites b/archivebox/vendor/python-atomicwrites deleted file mode 160000 index c35cd32eb3..0000000000 --- a/archivebox/vendor/python-atomicwrites +++ /dev/null @@ -1 +0,0 @@ -Subproject commit c35cd32eb364d5a4210e64bf38fd1a55f329f316 diff --git 
a/archivebox/vendor/taggit_utils.py b/archivebox/vendor/taggit_utils.py deleted file mode 120000 index f36776dbc4..0000000000 --- a/archivebox/vendor/taggit_utils.py +++ /dev/null @@ -1 +0,0 @@ -django-taggit/taggit/utils.py \ No newline at end of file diff --git a/archivebox/workers/__init__.py b/archivebox/workers/__init__.py new file mode 100644 index 0000000000..d8ddf76e59 --- /dev/null +++ b/archivebox/workers/__init__.py @@ -0,0 +1,8 @@ +__package__ = "archivebox.workers" +__order__ = 100 + + +def register_admin(admin_site): + from archivebox.workers.admin import register_admin + + register_admin(admin_site) diff --git a/archivebox/workers/admin.py b/archivebox/workers/admin.py new file mode 100644 index 0000000000..134eaf59e2 --- /dev/null +++ b/archivebox/workers/admin.py @@ -0,0 +1,12 @@ +""" +Workers admin module. + +Background runner processes do not need Django admin registration. +""" + +__package__ = "archivebox.workers" + + +def register_admin(admin_site): + """No models to register - workers are process-based, not Django models.""" + pass diff --git a/archivebox/workers/apps.py b/archivebox/workers/apps.py new file mode 100644 index 0000000000..7bb78428c5 --- /dev/null +++ b/archivebox/workers/apps.py @@ -0,0 +1,7 @@ +from django.apps import AppConfig + + +class WorkersConfig(AppConfig): + default_auto_field = "django.db.models.BigAutoField" + name = "archivebox.workers" + label = "workers" diff --git a/archivebox/workers/management/__init__.py b/archivebox/workers/management/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/archivebox/workers/management/commands/__init__.py b/archivebox/workers/management/commands/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/archivebox/workers/management/commands/runner_watch.py b/archivebox/workers/management/commands/runner_watch.py new file mode 100644 index 0000000000..9cb82152f8 --- /dev/null +++ b/archivebox/workers/management/commands/runner_watch.py @@ 
-0,0 +1,118 @@ +from django.core.management.base import BaseCommand + + +class Command(BaseCommand): + help = "Watch the runserver autoreload PID file and restart the background runner on reloads." + + def add_arguments(self, parser): + parser.add_argument( + "--pidfile", + default=None, + help="Path to runserver pidfile to watch", + ) + parser.add_argument( + "--interval", + type=float, + default=1.0, + help="Polling interval in seconds", + ) + + def handle(self, *args, **kwargs): + import os + import time + + import psutil + + from archivebox.config.common import STORAGE_CONFIG + from archivebox.machine.models import Machine, Process + from archivebox.workers.supervisord_util import ( + RUNNER_WORKER, + get_existing_supervisord_process, + get_worker, + start_worker, + stop_worker, + ) + + pidfile = kwargs.get("pidfile") or os.environ.get("ARCHIVEBOX_RUNSERVER_PIDFILE") + if not pidfile: + pidfile = str(STORAGE_CONFIG.TMP_DIR / "runserver.pid") + + interval = max(0.2, float(kwargs.get("interval", 1.0))) + last_pid = None + + def stop_duplicate_watchers() -> None: + current_pid = os.getpid() + for proc in psutil.process_iter(["pid", "cmdline"]): + if proc.info["pid"] == current_pid: + continue + cmdline = proc.info.get("cmdline") or [] + if not cmdline: + continue + if "runner_watch" not in " ".join(cmdline): + continue + if not any(str(arg) == f"--pidfile={pidfile}" or str(arg) == pidfile for arg in cmdline): + continue + try: + proc.terminate() + proc.wait(timeout=2.0) + except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.TimeoutExpired): + try: + proc.kill() + except (psutil.NoSuchProcess, psutil.AccessDenied): + pass + + def get_supervisor(): + supervisor = get_existing_supervisord_process() + if supervisor is None: + raise RuntimeError("runner_watch requires a running supervisord process") + return supervisor + + stop_duplicate_watchers() + start_worker(get_supervisor(), RUNNER_WORKER, lazy=True) + + def restart_runner() -> None: + 
Process.cleanup_stale_running() + Process.cleanup_orphaned_workers() + machine = Machine.current() + + running = Process.objects.filter( + machine=machine, + status=Process.StatusChoices.RUNNING, + process_type=Process.TypeChoices.ORCHESTRATOR, + ) + for proc in running: + try: + proc.kill_tree(graceful_timeout=0.5) + except Exception: + continue + + supervisor = get_supervisor() + + try: + stop_worker(supervisor, RUNNER_WORKER["name"]) + except Exception: + pass + + start_worker(supervisor, RUNNER_WORKER) + + def runner_running() -> bool: + proc = get_worker(get_supervisor(), RUNNER_WORKER["name"]) + return bool(proc and proc.get("statename") == "RUNNING") + + while True: + try: + if os.path.exists(pidfile): + with open(pidfile) as handle: + pid = handle.read().strip() or None + else: + pid = None + + if pid and pid != last_pid: + restart_runner() + last_pid = pid + elif not runner_running(): + restart_runner() + except Exception: + pass + + time.sleep(interval) diff --git a/archivebox/workers/migrations/__init__.py b/archivebox/workers/migrations/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/archivebox/workers/models.py b/archivebox/workers/models.py new file mode 100644 index 0000000000..1825440432 --- /dev/null +++ b/archivebox/workers/models.py @@ -0,0 +1,459 @@ +__package__ = "archivebox.workers" + +from typing import ClassVar +from collections.abc import Iterable +from datetime import datetime, timedelta +from statemachine.mixins import MachineMixin + +from django.db import models +from django.core import checks +from django.utils import timezone +from django.utils.functional import classproperty +from django_stubs_ext.db.models import TypedModelMeta + +from statemachine import registry, StateMachine, State + + +class DefaultStatusChoices(models.TextChoices): + QUEUED = "queued", "Queued" + STARTED = "started", "Started" + SEALED = "sealed", "Sealed" + + +default_status_field: models.CharField = models.CharField( + 
choices=DefaultStatusChoices.choices, + max_length=15, + default=DefaultStatusChoices.QUEUED, + null=False, + blank=False, + db_index=True, +) +default_retry_at_field: models.DateTimeField = models.DateTimeField(default=timezone.now, null=True, blank=True, db_index=True) + +ObjectState = State | str +ObjectStateList = Iterable[ObjectState] + + +class BaseModelWithStateMachine(models.Model, MachineMixin): + StatusChoices: ClassVar[type[DefaultStatusChoices]] + + # status: models.CharField + # retry_at: models.DateTimeField + + state_machine_name: str | None = None + state_field_name: str + state_machine_attr: str = "sm" + bind_events_as_methods: bool = True + + active_state: ObjectState + retry_at_field_name: str + + class Meta(TypedModelMeta): + app_label = "workers" + abstract = True + + @classmethod + def check(cls, sender=None, **kwargs): + import sys + + # Skip state machine checks during makemigrations to avoid premature registry access + if "makemigrations" in sys.argv: + return super().check(**kwargs) + + errors = super().check(**kwargs) + + found_id_field = False + found_status_field = False + found_retry_at_field = False + + for field in cls._meta.get_fields(): + if getattr(field, "_is_state_field", False): + if cls.state_field_name == field.name: + found_status_field = True + if getattr(field, "choices", None) != cls.StatusChoices.choices: + errors.append( + checks.Error( + f"{cls.__name__}.{field.name} must have choices set to {cls.__name__}.StatusChoices.choices", + hint=f"{cls.__name__}.{field.name}.choices = {getattr(field, 'choices', None)!r}", + obj=cls, + id="workers.E011", + ), + ) + if getattr(field, "_is_retry_at_field", False): + if cls.retry_at_field_name == field.name: + found_retry_at_field = True + if field.name == "id" and getattr(field, "primary_key", False): + found_id_field = True + + if not found_status_field: + errors.append( + checks.Error( + f"{cls.__name__}.state_field_name must be defined and point to a StatusField()", + 
hint=f"{cls.__name__}.state_field_name = {cls.state_field_name!r} but {cls.__name__}.{cls.state_field_name!r} was not found or does not refer to StatusField", + obj=cls, + id="workers.E012", + ), + ) + if not found_retry_at_field: + errors.append( + checks.Error( + f"{cls.__name__}.retry_at_field_name must be defined and point to a RetryAtField()", + hint=f"{cls.__name__}.retry_at_field_name = {cls.retry_at_field_name!r} but {cls.__name__}.{cls.retry_at_field_name!r} was not found or does not refer to RetryAtField", + obj=cls, + id="workers.E013", + ), + ) + + if not found_id_field: + errors.append( + checks.Error( + f"{cls.__name__} must have an id field that is a primary key", + hint=f"{cls.__name__}.id field missing or not configured as primary key", + obj=cls, + id="workers.E014", + ), + ) + + if not isinstance(cls.state_machine_name, str): + errors.append( + checks.Error( + f"{cls.__name__}.state_machine_name must be a dotted-import path to a StateMachine class", + hint=f"{cls.__name__}.state_machine_name = {cls.state_machine_name!r}", + obj=cls, + id="workers.E015", + ), + ) + + try: + cls.StateMachineClass + except Exception as err: + errors.append( + checks.Error( + f"{cls.__name__}.state_machine_name must point to a valid StateMachine class, but got {type(err).__name__} {err} when trying to access {cls.__name__}.StateMachineClass", + hint=f"{cls.__name__}.state_machine_name = {cls.state_machine_name!r}", + obj=cls, + id="workers.E016", + ), + ) + + if cls.INITIAL_STATE not in cls.StatusChoices.values: + errors.append( + checks.Error( + f"{cls.__name__}.StateMachineClass.initial_state must be present within {cls.__name__}.StatusChoices", + hint=f"{cls.__name__}.StateMachineClass.initial_state = {cls.StateMachineClass.initial_state!r}", + obj=cls, + id="workers.E017", + ), + ) + + if cls.ACTIVE_STATE not in cls.StatusChoices.values: + errors.append( + checks.Error( + f"{cls.__name__}.active_state must be set to a valid State present within 
{cls.__name__}.StatusChoices", + hint=f"{cls.__name__}.active_state = {cls.active_state!r}", + obj=cls, + id="workers.E018", + ), + ) + + for state in cls.FINAL_STATES: + if state not in cls.StatusChoices.values: + errors.append( + checks.Error( + f"{cls.__name__}.StateMachineClass.final_states must all be present within {cls.__name__}.StatusChoices", + hint=f"{cls.__name__}.StateMachineClass.final_states = {cls.StateMachineClass.final_states!r}", + obj=cls, + id="workers.E019", + ), + ) + break + return errors + + @staticmethod + def _state_to_str(state: ObjectState) -> str: + """Convert a statemachine.State, models.TextChoices.choices value, or Enum value to a str""" + return str(state.value) if isinstance(state, State) else str(state) + + @property + def RETRY_AT(self) -> datetime: + return getattr(self, self.retry_at_field_name) + + @RETRY_AT.setter + def RETRY_AT(self, value: datetime): + setattr(self, self.retry_at_field_name, value) + + @property + def STATE(self) -> str: + return getattr(self, self.state_field_name) + + @STATE.setter + def STATE(self, value: str): + setattr(self, self.state_field_name, value) + + def bump_retry_at(self, seconds: int = 10): + self.RETRY_AT = timezone.now() + timedelta(seconds=seconds) + + def update_and_requeue(self, **kwargs) -> bool: + """ + Atomically update fields and schedule retry_at for next worker tick. + Returns True if the update was successful, False if the object was modified by another worker. 
+ """ + # Get the current retry_at to use as optimistic lock + current_retry_at = self.RETRY_AT + + # Apply the updates + for key, value in kwargs.items(): + setattr(self, key, value) + + # Try to save with optimistic locking + updated = ( + type(self) + .objects.filter( + pk=self.pk, + retry_at=current_retry_at, + ) + .update(**{k: getattr(self, k) for k in kwargs}) + ) + + if updated == 1: + self.refresh_from_db() + return True + return False + + @classmethod + def get_queue(cls): + """ + Get the sorted and filtered QuerySet of objects that are ready for processing. + Objects are ready if: + - status is not in FINAL_STATES + - retry_at is in the past (or now) + """ + return ( + cls.objects.filter( + retry_at__lte=timezone.now(), + ) + .exclude( + status__in=cls.FINAL_STATES, + ) + .order_by("retry_at") + ) + + @classmethod + def claim_for_worker(cls, obj: "BaseModelWithStateMachine", lock_seconds: int = 60) -> bool: + """ + Atomically claim a due object for processing using retry_at as the lock. + + Correct lifecycle for any state-machine-driven work item: + 1. Queue the item by setting retry_at <= now + 2. Exactly one owner claims it by moving retry_at into the future + 3. Only that owner may call .sm.tick() and perform side effects + 4. State-machine callbacks update retry_at again when the work completes, + backs off, or is re-queued + + The critical rule is that future retry_at values are already owned. + Callers must never "steal" those future timestamps and start another + copy of the same work. That is what prevents duplicate installs, hook + runs, and other concurrent side effects. + + Returns True if successfully claimed, False if another worker got it + first or the object is not currently due. 
+ """ + updated = cls.objects.filter( + pk=obj.pk, + retry_at=obj.RETRY_AT, + retry_at__lte=timezone.now(), + ).update( + retry_at=timezone.now() + timedelta(seconds=lock_seconds), + ) + return updated == 1 + + def claim_processing_lock(self, lock_seconds: int = 60) -> bool: + """ + Claim this model instance immediately before executing one state-machine tick. + + This helper is the safe entrypoint for any direct state-machine driver + (workers, synchronous crawl dependency installers, one-off CLI helpers). + Calling `.sm.tick()` without claiming first turns retry_at into "just a + schedule" instead of the ownership lock it is meant to be. + + Returns True only for the caller that successfully moved retry_at into + the future. False means another process already owns the work item or it + is not currently due. + """ + if self.STATE in self.FINAL_STATES: + return False + if self.RETRY_AT is None: + return False + + claimed = type(self).claim_for_worker(self, lock_seconds=lock_seconds) + if claimed: + self.refresh_from_db() + return claimed + + def tick_claimed(self, lock_seconds: int = 60) -> bool: + """ + Claim ownership via retry_at and then execute exactly one `.sm.tick()`. + + Future maintainers should prefer this helper over calling `.sm.tick()` + directly whenever there is any chance another process could see the same + queued row. If this method returns False, someone else already owns the + work and the caller must not run side effects for it. 
+ """ + if not self.claim_processing_lock(lock_seconds=lock_seconds): + return False + + tick = getattr(getattr(self, self.state_machine_attr, None), "tick", None) + if not callable(tick): + raise TypeError(f"{type(self).__name__}.{self.state_machine_attr}.tick() must be callable") + tick() + self.refresh_from_db() + return True + + @classproperty + def ACTIVE_STATE(cls) -> str: + return cls._state_to_str(cls.active_state) + + @classproperty + def INITIAL_STATE(cls) -> str: + initial_state = cls.StateMachineClass.initial_state + if initial_state is None: + raise ValueError("StateMachineClass.initial_state must not be None") + return cls._state_to_str(initial_state) + + @classproperty + def FINAL_STATES(cls) -> list[str]: + return [cls._state_to_str(state) for state in cls.StateMachineClass.final_states] + + @classproperty + def FINAL_OR_ACTIVE_STATES(cls) -> list[str]: + return [*cls.FINAL_STATES, cls.ACTIVE_STATE] + + @classmethod + def extend_choices(cls, base_choices: type[models.TextChoices]): + """ + Decorator to extend the base choices with extra choices, e.g.: + + class MyModel(ModelWithStateMachine): + + @ModelWithStateMachine.extend_choices(ModelWithStateMachine.StatusChoices) + class StatusChoices(models.TextChoices): + SUCCEEDED = 'succeeded' + FAILED = 'failed' + SKIPPED = 'skipped' + """ + assert issubclass(base_choices, models.TextChoices), ( + f"@extend_choices(base_choices) must be a TextChoices class, not {base_choices.__name__}" + ) + + def wrapper(extra_choices: type[models.TextChoices]) -> type[models.TextChoices]: + joined = {} + for item in base_choices.choices: + joined[item[0]] = item[1] + for item in extra_choices.choices: + joined[item[0]] = item[1] + joined_choices = models.TextChoices("StatusChoices", joined) + assert isinstance(joined_choices, type) + return joined_choices + + return wrapper + + @classmethod + def StatusField(cls, **kwargs) -> models.CharField: + """ + Used on subclasses to extend/modify the status field with updated 
kwargs. e.g.: + + class MyModel(ModelWithStateMachine): + class StatusChoices(ModelWithStateMachine.StatusChoices): + QUEUED = 'queued', 'Queued' + STARTED = 'started', 'Started' + SEALED = 'sealed', 'Sealed' + BACKOFF = 'backoff', 'Backoff' + FAILED = 'failed', 'Failed' + SKIPPED = 'skipped', 'Skipped' + + status = ModelWithStateMachine.StatusField(choices=StatusChoices.choices, default=StatusChoices.QUEUED) + """ + default_kwargs = default_status_field.deconstruct()[3] + updated_kwargs = {**default_kwargs, **kwargs} + field = models.CharField(**updated_kwargs) + field._is_state_field = True # type: ignore + return field + + @classmethod + def RetryAtField(cls, **kwargs) -> models.DateTimeField: + """ + Used on subclasses to extend/modify the retry_at field with updated kwargs. e.g.: + + class MyModel(ModelWithStateMachine): + retry_at = ModelWithStateMachine.RetryAtField(editable=False) + """ + default_kwargs = default_retry_at_field.deconstruct()[3] + updated_kwargs = {**default_kwargs, **kwargs} + field = models.DateTimeField(**updated_kwargs) + field._is_retry_at_field = True # type: ignore + return field + + @classproperty + def StateMachineClass(cls) -> type[StateMachine]: + """Get the StateMachine class for the given django Model that inherits from MachineMixin""" + + model_state_machine_name = getattr(cls, "state_machine_name", None) + if model_state_machine_name: + StateMachineCls = registry.get_machine_cls(model_state_machine_name) + assert issubclass(StateMachineCls, StateMachine) + return StateMachineCls + raise NotImplementedError("ActorType must define .state_machine_name that points to a valid StateMachine") + + +class ModelWithStateMachine(BaseModelWithStateMachine): + StatusChoices = DefaultStatusChoices + + status: models.CharField = BaseModelWithStateMachine.StatusField() + retry_at: models.DateTimeField = BaseModelWithStateMachine.RetryAtField() + + state_machine_name: str | None # e.g. 
'core.models.ArchiveResultMachine' + state_field_name: str = "status" + state_machine_attr: str = "sm" + bind_events_as_methods: bool = True + + active_state = StatusChoices.STARTED + retry_at_field_name: str = "retry_at" + + class Meta(BaseModelWithStateMachine.Meta): + abstract = True + + +class BaseStateMachine(StateMachine): + """ + Base class for all ArchiveBox state machines. + + Eliminates boilerplate __init__, __repr__, __str__ methods that were + duplicated across all 4 state machines (Snapshot, ArchiveResult, Crawl, Binary). + + Subclasses must set model_attr_name to specify the attribute name + (e.g., 'snapshot', 'archiveresult', 'crawl', 'binary'). + + Example usage: + class SnapshotMachine(BaseStateMachine): + model_attr_name = 'snapshot' + + # States and transitions... + queued = State(value=Snapshot.StatusChoices.QUEUED, initial=True) + # ... + + The model instance is accessible via self.{model_attr_name} + (e.g., self.snapshot, self.archiveresult, etc.) + """ + + model_attr_name: str = "obj" # Override in subclasses + + def __init__(self, obj, *args, **kwargs): + setattr(self, self.model_attr_name, obj) + super().__init__(obj, *args, **kwargs) + + def __repr__(self) -> str: + obj = getattr(self, self.model_attr_name) + return f"{self.__class__.__name__}[{obj.id}]" + + def __str__(self) -> str: + return self.__repr__() diff --git a/archivebox/workers/supervisord_util.py b/archivebox/workers/supervisord_util.py new file mode 100644 index 0000000000..9304a6a4a5 --- /dev/null +++ b/archivebox/workers/supervisord_util.py @@ -0,0 +1,682 @@ +__package__ = "archivebox.workers" + +import sys +import time +import socket +import psutil +import shutil +import subprocess +import shlex + +from typing import cast +from collections.abc import Iterator +from pathlib import Path +from functools import cache + +from rich import print +from supervisor.xmlrpc import SupervisorTransport +from xmlrpc.client import ServerProxy + +from archivebox.config import CONSTANTS 
+from archivebox.config.paths import get_or_create_working_tmp_dir +from archivebox.config.permissions import ARCHIVEBOX_USER +from archivebox.misc.logging import STDERR +from archivebox.misc.logging_util import pretty_path + +LOG_FILE_NAME = "supervisord.log" +CONFIG_FILE_NAME = "supervisord.conf" +PID_FILE_NAME = "supervisord.pid" +WORKERS_DIR_NAME = "workers" + +# Global reference to supervisord process for cleanup +_supervisord_proc = None + + +def _shell_join(args: list[str]) -> str: + return shlex.join(args) + + +RUNNER_WORKER = { + "name": "worker_runner", + "command": _shell_join([sys.executable, "-m", "archivebox", "run", "--daemon"]), + "autostart": "false", + "autorestart": "true", + "stdout_logfile": "logs/worker_runner.log", + "redirect_stderr": "true", +} + +RUNNER_WATCH_WORKER = lambda pidfile: { + "name": "worker_runner_watch", + "command": _shell_join([sys.executable, "-m", "archivebox", "manage", "runner_watch", f"--pidfile={pidfile}"]), + "autostart": "false", + "autorestart": "true", + "stdout_logfile": "logs/worker_runner_watch.log", + "redirect_stderr": "true", +} + +SERVER_WORKER = lambda host, port: { + "name": "worker_daphne", + "command": _shell_join( + [ + sys.executable, + "-m", + "daphne", + f"--bind={host}", + f"--port={port}", + "--application-close-timeout=600", + "archivebox.core.asgi:application", + ], + ), + "autostart": "false", + "autorestart": "true", + "stdout_logfile": "logs/worker_daphne.log", + "redirect_stderr": "true", +} + + +def RUNSERVER_WORKER(host: str, port: str, *, reload: bool, pidfile: str | None = None, nothreading: bool = False): + command = [sys.executable, "-m", "archivebox", "manage", "runserver", f"{host}:{port}"] + if not reload: + command.append("--noreload") + if nothreading: + command.append("--nothreading") + + environment = ['ARCHIVEBOX_RUNSERVER="1"'] + if reload: + assert pidfile, "RUNSERVER_WORKER requires a pidfile when reload=True" + environment.extend( + [ + 'ARCHIVEBOX_AUTORELOAD="1"', + 
def is_port_in_use(host: str, port: int) -> bool:
    """Report whether an IPv4 TCP socket cannot be bound to ``host:port``.

    A throwaway socket with SO_REUSEADDR set is bound to the address. Any
    OSError raised while creating, configuring, binding or closing that
    socket is interpreted as "in use".

    Returns:
        False when the bind succeeded, True otherwise.
    """
    in_use = True
    try:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
            probe.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
            probe.bind((host, port))
        # Only reached when the probe was bound and closed without error.
        in_use = False
    except OSError:
        pass
    return in_use
def create_worker_config(daemon):
    """Write the supervisord ``[program:<name>]`` config file for one worker.

    The file is created as ``<name>.conf`` inside the workers directory that
    sits next to the supervisord socket file. Every key of ``daemon`` except
    ``name`` becomes one ``key=value`` line of the program section.
    """
    workers_dir = get_sock_file().parent / WORKERS_DIR_NAME
    workers_dir.mkdir(exist_ok=True, parents=True)

    program = daemon["name"]
    conf_path = workers_dir / f"{program}.conf"

    # "name" is carried by the section header rather than emitted as an option.
    option_lines = [f"{option}={setting}\n" for option, setting in daemon.items() if option != "name"]
    section = "".join([f"[program:{program}]\n", *option_lines, "\n"])

    conf_path.write_text(section)
"http://localhost", + transport=transport, + ) # user:pass@localhost doesn't work for some reason with unix://.sock, cant seem to silence CRIT no-auth warning + current_state = cast(dict[str, int | str], server.supervisor.getState()) + if current_state["statename"] == "RUNNING": + pid = server.supervisor.getPID() + print(f"[đŸĻ¸â€â™‚ī¸] Supervisord connected (pid={pid}) via unix://{pretty_path(SOCK_FILE)}.") + return server.supervisor + except FileNotFoundError: + return None + except Exception as e: + print(f"Error connecting to existing supervisord: {str(e)}") + return None + + +def stop_existing_supervisord_process(): + global _supervisord_proc + SOCK_FILE = get_sock_file() + PID_FILE = SOCK_FILE.parent / PID_FILE_NAME + + try: + # First try to stop via the global proc reference + if _supervisord_proc and _supervisord_proc.poll() is None: + try: + print(f"[đŸĻ¸â€â™‚ī¸] Stopping supervisord process (pid={_supervisord_proc.pid})...") + _supervisord_proc.terminate() + try: + _supervisord_proc.wait(timeout=5) + except subprocess.TimeoutExpired: + _supervisord_proc.kill() + _supervisord_proc.wait(timeout=2) + except (BrokenPipeError, OSError): + pass + finally: + _supervisord_proc = None + return + + # Fallback: if pid file exists, load PID int and kill that process + try: + pid = int(PID_FILE.read_text()) + except (FileNotFoundError, ValueError): + return + + try: + print(f"[đŸĻ¸â€â™‚ī¸] Stopping supervisord process (pid={pid})...") + proc = psutil.Process(pid) + # Kill the entire process group to ensure all children are stopped + children = proc.children(recursive=True) + proc.terminate() + # Also terminate all children + for child in children: + try: + child.terminate() + except psutil.NoSuchProcess: + pass + proc.wait(timeout=5) + # Kill any remaining children + for child in children: + try: + if child.is_running(): + child.kill() + except psutil.NoSuchProcess: + pass + except psutil.NoSuchProcess: + pass + except (BrokenPipeError, OSError): + pass + 
def start_new_supervisord_process(daemonize=False):
    """Write a fresh supervisord config, launch supervisord and wait for it.

    Stale state from a previous run (workers dir, pid file, socket file and
    config file) is removed first, then a new config is generated with
    create_supervisord_config() and supervisord is started with its
    stdout/stderr appended to the supervisord log file.

    Args:
        daemonize: When true, supervisord is started in a new session and no
            handle to it is kept. Otherwise it stays in this process's
            session and the Popen handle is stored in the module-level
            ``_supervisord_proc`` so it can be waited on / stopped later.

    Returns:
        The result of wait_for_supervisord_ready().

    Raises:
        OSError: If the ``supervisord`` executable cannot be launched
            (e.g. FileNotFoundError when it is not on PATH).
    """
    global _supervisord_proc

    SOCK_FILE = get_sock_file()
    WORKERS_DIR = SOCK_FILE.parent / WORKERS_DIR_NAME
    LOG_FILE = CONSTANTS.LOGS_DIR / LOG_FILE_NAME
    CONFIG_FILE = SOCK_FILE.parent / CONFIG_FILE_NAME
    PID_FILE = SOCK_FILE.parent / PID_FILE_NAME

    print(f"[🦸‍♂️] Supervisord starting{' in background' if daemonize else ''}...")
    pretty_log_path = pretty_path(LOG_FILE)
    print(f" > Writing supervisord logs to: {pretty_log_path}")
    print(f" > Writing task worker logs to: {pretty_log_path.replace('supervisord.log', 'worker_*.log')}")
    print(f" > Using supervisord config file: {pretty_path(CONFIG_FILE)}")
    print(f" > Using supervisord UNIX socket: {pretty_path(SOCK_FILE)}")
    print()

    # clear out existing stale state files
    shutil.rmtree(WORKERS_DIR, ignore_errors=True)
    PID_FILE.unlink(missing_ok=True)
    SOCK_FILE.unlink(missing_ok=True)
    CONFIG_FILE.unlink(missing_ok=True)

    # create the supervisord config file
    create_supervisord_config()

    LOG_FILE.parent.mkdir(parents=True, exist_ok=True)

    # Pass argv as a list (no shell): the config path is never re-parsed by a
    # shell, and the Popen pid is supervisord itself rather than a wrapper.
    command = ["supervisord", f"--configuration={CONFIG_FILE}"]

    # The child inherits its own copy of the log descriptor, so the parent's
    # handle can be closed as soon as the process has been spawned.
    with open(LOG_FILE, "a") as log_handle:
        proc = subprocess.Popen(
            command,
            stdin=None,
            stdout=log_handle,
            stderr=log_handle,
            # Foreground mode stays in our session so signals reach
            # supervisord; background mode detaches into a new session.
            start_new_session=bool(daemonize),
        )

    if not daemonize:
        # Keep the handle so callers can wait on / terminate supervisord later.
        _supervisord_proc = proc

    return wait_for_supervisord_ready()
+ supervisor.getPID() # make sure it doesn't throw an exception + + (WORKERS_DIR / "initial_startup.conf").unlink(missing_ok=True) + + return supervisor + + +def start_worker(supervisor, daemon, lazy=False): + assert supervisor.getPID() + + print(f"[đŸĻ¸â€â™‚ī¸] Supervisord starting new subprocess worker: {daemon['name']}...") + create_worker_config(daemon) + + result = supervisor.reloadConfig() + added, changed, removed = result[0] + # print(f"Added: {added}, Changed: {changed}, Removed: {removed}") + for removed in removed: + supervisor.stopProcessGroup(removed) + supervisor.removeProcessGroup(removed) + for changed in changed: + supervisor.stopProcessGroup(changed) + supervisor.removeProcessGroup(changed) + supervisor.addProcessGroup(changed) + for added in added: + supervisor.addProcessGroup(added) + + procs = [] + for _ in range(25): + procs = supervisor.getAllProcessInfo() + for proc in procs: + if proc["name"] == daemon["name"]: + # See process state diagram here: http://supervisord.org/subprocess.html + if proc["statename"] == "RUNNING": + print(f" - Worker {daemon['name']}: already {proc['statename']} ({proc['description']})") + return proc + else: + if not lazy: + supervisor.startProcessGroup(daemon["name"], True) + proc = supervisor.getProcessInfo(daemon["name"]) + print(f" - Worker {daemon['name']}: started {proc['statename']} ({proc['description']})") + return proc + + # retry in a moment in case it's slow to launch + time.sleep(0.2) + + raise Exception(f"Failed to start worker {daemon['name']}! 
def tail_multiple_worker_logs(log_files: list[str], follow=True, proc=None):
    """Tail multiple log files simultaneously, interleaving their output.

    Relative paths are resolved against ``CONSTANTS.DATA_DIR`` and missing log
    files (and their parent directories) are created empty. ANSI colour codes
    are stripped and blank lines are skipped before printing.

    Args:
        log_files: List of log file paths to tail.
        follow: When true, start at the current end of each file and keep
            printing lines as they are appended. When false, print the
            content that already exists in each file once and return.
        proc: Optional object with ``poll()`` and ``returncode`` (such as a
            subprocess.Popen). In follow mode tailing stops once ``poll()``
            returns a non-None value; ignored when ``follow`` is false.

    KeyboardInterrupt, SystemExit and OSError (including BrokenPipeError)
    raised while reading/printing are swallowed so the caller can handle
    shutdown; the log files are always closed before returning.
    """
    import re

    # Compiled once instead of re-parsing the pattern for every log line.
    ansi_escape = re.compile(r"\x1b\[[0-9;]*m")

    # Convert relative paths to absolute paths
    log_paths = []
    for log_file in log_files:
        log_path = Path(log_file)
        if not log_path.is_absolute():
            log_path = CONSTANTS.DATA_DIR / log_path

        # Create log file if it doesn't exist
        if not log_path.exists():
            log_path.parent.mkdir(parents=True, exist_ok=True)
            log_path.touch()

        log_paths.append(log_path)

    # Open all log files
    file_handles = []
    for log_path in log_paths:
        f = None
        try:
            f = open(log_path)
            if follow:
                # Only show NEW logs from now on, not old logs
                f.seek(0, 2)  # whence=2: relative to end of file
        except Exception as e:
            if f is not None:
                f.close()  # don't leak the handle if seek() failed after open()
            sys.stderr.write(f"Warning: Could not open {log_path}: {e}\n")
            continue

        file_handles.append((log_path, f))
        print(f" [tailing {log_path.name}]")

    if not file_handles:
        sys.stderr.write("No log files could be opened\n")
        return

    print()

    def drain() -> bool:
        """Print every line currently readable from all files; True if any was read."""
        had_output = False
        for _, handle in file_handles:
            # readline() returns "" once nothing more is available right now
            for line in iter(handle.readline, ""):
                had_output = True
                # Strip ANSI codes if present (supervisord does this but just in case)
                line_clean = ansi_escape.sub("", line.rstrip())
                if line_clean:
                    print(line_clean)
        return had_output

    try:
        if not follow:
            # One-shot mode: dump what is already in the files and stop.
            drain()
            return

        while True:
            # Check for exit BEFORE reading, so that anything the process
            # wrote right before exiting is still printed by this drain().
            exited = proc is not None and proc.poll() is not None

            had_output = drain()

            if exited:
                print(f"\n[server process exited with code {proc.returncode}]")
                break

            # Small sleep to avoid busy-waiting (only when no output)
            if not had_output:
                time.sleep(0.05)

    except (KeyboardInterrupt, SystemExit, BrokenPipeError, OSError):
        pass  # Let the caller handle the cleanup message
    finally:
        # Close all file handles
        for _, f in file_handles:
            try:
                f.close()
            except Exception:
                pass
def start_cli_workers(watch=False):
    """Start the runner worker under supervisord for CLI usage.

    Args:
        watch: When true, block until the supervisord process exits (or, if
            no process handle is available, until watch_worker() returns),
            then stop supervisord and pause one second before returning.

    Returns:
        A single-element list containing the RUNNER_WORKER definition.
    """
    supervisor = get_or_create_supervisord_process(daemonize=False)
    start_worker(supervisor, RUNNER_WORKER)
    started = [RUNNER_WORKER]

    if not watch:
        return started

    try:
        supervisord = _supervisord_proc
        if not supervisord:
            # No process handle to block on, so monitor the worker instead.
            watch_worker(supervisor, RUNNER_WORKER["name"])
        else:
            # supervisord handles signals itself and stops its children.
            supervisord.wait()
    except SystemExit:
        pass
    except (KeyboardInterrupt, BrokenPipeError, OSError):
        STDERR.print("\n[🛑] Got Ctrl+C, stopping gracefully...")
    except BaseException as exc:
        STDERR.print(f"\n[🛑] Got {exc.__class__.__name__} exception, stopping gracefully...")
    finally:
        # Always tear supervisord (and its children) down on the way out.
        stop_existing_supervisord_process()
        time.sleep(1.0)  # Give processes time to fully terminate

    return started
"webworker_error.log", +# }, +# } +# main(DAEMONS, cwd) diff --git a/archivebox/workers/tasks.py b/archivebox/workers/tasks.py new file mode 100644 index 0000000000..332a0fb410 --- /dev/null +++ b/archivebox/workers/tasks.py @@ -0,0 +1,89 @@ +""" +Background task functions for queuing work to the background runner. + +These functions queue Snapshots/Crawls for processing by setting their status +to QUEUED so `archivebox run --daemon` or `archivebox server` can pick them up. + +NOTE: These functions do NOT start the runner. They assume it's already +running via `archivebox server` or will be run inline by the CLI. +""" + +__package__ = "archivebox.workers" + +from django.utils import timezone + + +def bg_add(add_kwargs: dict) -> int: + """ + Add URLs and queue them for archiving. + + Returns the number of snapshots created. + """ + from archivebox.cli.archivebox_add import add + + assert add_kwargs and add_kwargs.get("urls") + + # When called as background task, always run in background mode + add_kwargs = add_kwargs.copy() + add_kwargs["bg"] = True + + _, result = add(**add_kwargs) + + return len(result) if result else 0 + + +def bg_archive_snapshots(snapshots, kwargs: dict | None = None) -> int: + """ + Queue multiple snapshots for archiving via the shared runner loop. + + Returns the number of snapshots queued. 
def bg_archive_snapshot(snapshot, overwrite: bool = False, methods: list | None = None) -> int:
    """
    Mark one snapshot (and its crawl, if any) as QUEUED for the runner.

    The snapshot row is updated to QUEUED with ``retry_at`` set to now and
    ``downloaded_at`` cleared. If the snapshot has a truthy ``crawl_id``,
    that crawl is likewise set to QUEUED with ``retry_at`` set to now.

    ``overwrite`` and ``methods`` are accepted but not used by this function.

    Returns 1 if the snapshot was queued, 0 if ``snapshot`` has no ``id``.
    """
    from archivebox.core.models import Snapshot
    from archivebox.crawls.models import Crawl

    if not hasattr(snapshot, "id"):
        return 0

    matching_snapshots = Snapshot.objects.filter(id=snapshot.id)
    matching_snapshots.update(
        status=Snapshot.StatusChoices.QUEUED,
        retry_at=timezone.now(),
        downloaded_at=None,
    )

    parent_crawl_id = getattr(snapshot, "crawl_id", None)
    if parent_crawl_id:
        matching_crawls = Crawl.objects.filter(id=parent_crawl_id)
        matching_crawls.update(
            status=Crawl.StatusChoices.QUEUED,
            retry_at=timezone.now(),
        )

    return 1
or set your $$PATH?" - exit 2 -fi diff --git a/bin/build.sh b/bin/build.sh index b5d481151f..1ebf2d3d4e 100755 --- a/bin/build.sh +++ b/bin/build.sh @@ -24,6 +24,8 @@ cd "$REPO_DIR" ./bin/build_docker.sh echo "[√] Done. Install the built package by running:" -echo " python3 setup.py install" +echo " pip install archivebox" echo " # or" -echo " pip3 install ." +echo " sudo apt install ./dist/archivebox*.deb" +echo " # or" +echo " brew tap archivebox/archivebox && brew install archivebox" diff --git a/bin/build_brew.sh b/bin/build_brew.sh index ec54c90a7e..80d4aa11f4 100755 --- a/bin/build_brew.sh +++ b/bin/build_brew.sh @@ -11,19 +11,102 @@ set -o pipefail IFS=$'\n' REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )" +cd "$REPO_DIR" +VERSION="$(grep '^version = ' "${REPO_DIR}/pyproject.toml" | awk -F'"' '{print $2}')" +FORMULA_FILE="$REPO_DIR/brew_dist/archivebox.rb" -CURRENT_PLAFORM="$(uname)" -REQUIRED_PLATFORM="Darwin" -if [[ "$CURRENT_PLAFORM" != "$REQUIRED_PLATFORM" ]]; then - echo "[!] Skipping the Homebrew package build on $CURRENT_PLAFORM (it can only be run on $REQUIRED_PLATFORM)." - exit 0 +echo "[+] Building Homebrew formula for archivebox==${VERSION}..." + +# Create a temporary virtualenv for generating the formula +TMPDIR="$(mktemp -d)" +trap "rm -rf $TMPDIR" EXIT + +python3 -m venv "$TMPDIR/venv" +source "$TMPDIR/venv/bin/activate" + +pip install --quiet "archivebox==${VERSION}" homebrew-pypi-poet 2>/dev/null + +echo "[+] Generating resource stanzas with homebrew-pypi-poet..." 
+RESOURCES="$(poet archivebox)" + +# Get the sdist URL and SHA256 from PyPI JSON API (works on macOS and Linux) +SDIST_URL="" +SDIST_SHA256="" +PYPI_JSON="$(curl -fsSL "https://pypi.org/pypi/archivebox/${VERSION}/json" 2>/dev/null || echo '')" +if [ -n "$PYPI_JSON" ]; then + SDIST_URL="$(echo "$PYPI_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(next((u['url'] for u in d['urls'] if u['packagetype']=='sdist'), ''))" 2>/dev/null || echo '')" + SDIST_SHA256="$(echo "$PYPI_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(next((u['digests']['sha256'] for u in d['urls'] if u['packagetype']=='sdist'), ''))" 2>/dev/null || echo '')" fi +if [ -z "$SDIST_URL" ]; then + SDIST_URL="https://files.pythonhosted.org/packages/source/a/archivebox/archivebox-${VERSION}.tar.gz" +fi +if [ -z "$SDIST_SHA256" ]; then + # Fallback: download and compute locally + mkdir -p "$TMPDIR/sdist" + pip download --no-binary :all: --no-deps -d "$TMPDIR/sdist" "archivebox==${VERSION}" 2>/dev/null || true + SDIST_SHA256="$(shasum -a 256 "$TMPDIR/sdist/"*.tar.gz 2>/dev/null | awk '{print $1}' || echo '')" +fi + +deactivate + +echo "[+] Updating formula file: $FORMULA_FILE" + +# Build the formula from the template +cat > "$FORMULA_FILE" << RUBY +# Auto-generated by bin/build_brew.sh using homebrew-pypi-poet. +# Users install with: brew tap archivebox/archivebox && brew install archivebox + +class Archivebox < Formula + include Language::Python::Virtualenv + + desc "Self-hosted internet archiving solution" + homepage "https://github.com/ArchiveBox/ArchiveBox" + url "${SDIST_URL}" + sha256 "${SDIST_SHA256}" + license "MIT" + head "https://github.com/ArchiveBox/ArchiveBox.git", branch: "dev" + + depends_on "python@3.13" + # All other runtime deps (node, chrome, yt-dlp, etc.) are installed + # on-demand by \`archivebox install\` and should NOT be declared here. 
+ + # Python dependency resource blocks auto-generated by homebrew-pypi-poet + # AUTOGENERATED_RESOURCES_START +${RESOURCES} + # AUTOGENERATED_RESOURCES_END + + def install + virtualenv_install_with_resources + end + + def post_install + # Initialize ArchiveBox data in the Homebrew-managed var directory + data_dir = var/"archivebox" + data_dir.mkpath + ENV["DATA_DIR"] = data_dir.to_s + system bin/"archivebox", "init" + end + + def caveats + <<~EOS + ArchiveBox data is stored in: + #{var}/archivebox + + To start archiving, run: + cd #{var}/archivebox && archivebox add 'https://example.com' + To start the web UI: + cd #{var}/archivebox && archivebox server 0.0.0.0:8000 + EOS + end -cd "$REPO_DIR/brew_dist" -# make sure archivebox.rb is up-to-date with the dependencies + test do + assert_match version.to_s, shell_output("#{bin}/archivebox version") + end +end +RUBY -echo "[+] Building Homebrew bottle" -brew install --build-bottle ./archivebox.rb -brew bottle archivebox +echo "[√] Formula updated: $FORMULA_FILE" +echo " Version: ${VERSION}" +echo " URL: ${SDIST_URL}" diff --git a/bin/build_deb.sh b/bin/build_deb.sh index 8c5c7fcffd..08c0950db9 100755 --- a/bin/build_deb.sh +++ b/bin/build_deb.sh @@ -10,35 +10,34 @@ set -o nounset set -o pipefail IFS=$'\n' - -CURRENT_PLAFORM="$(uname)" -REQUIRED_PLATFORM="Linux" -if [[ "$CURRENT_PLAFORM" != "$REQUIRED_PLATFORM" ]]; then - echo "[!] Skipping the Debian package build on $CURRENT_PLAFORM (it can only be run on $REQUIRED_PLATFORM)." - exit 0 -fi - - REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )" -VERSION="$(jq -r '.version' < "$REPO_DIR/package.json")" -DEBIAN_VERSION="${DEBIAN_VERSION:-1}" cd "$REPO_DIR" +VERSION="$(grep '^version = ' "${REPO_DIR}/pyproject.toml" | awk -F'"' '{print $2}')" +export VERSION -if [[ -f "$REPO_DIR/.venv/bin/activate" ]]; then - source "$REPO_DIR/.venv/bin/activate" -else - echo "[!] 
Warning: No virtualenv presesnt in $REPO_DIR.venv" -fi +# Default to amd64, can be overridden with ARCH=arm64 +export ARCH="${ARCH:-amd64}" + +echo "[+] Building .deb package for archivebox_${VERSION}_${ARCH}..." -# cleanup build artifacts -rm -Rf build deb_dist dist archivebox-*.tar.gz +# Check for nfpm +if ! command -v nfpm &>/dev/null; then + echo "[!] nfpm not found. Install it with one of:" + echo " go install github.com/goreleaser/nfpm/v2/cmd/nfpm@latest" + echo " uv tool install nfpm" + echo " brew install goreleaser/tap/nfpm" + echo " curl -sfL https://install.goreleaser.com/github.com/goreleaser/nfpm.sh | sh" + exit 1 +fi +mkdir -p "$REPO_DIR/dist" -# build source and binary packages -# make sure the stdeb.cfg file is up-to-date with all the dependencies -python3 setup.py --command-packages=stdeb.command \ - sdist_dsc --debian-version=$DEBIAN_VERSION \ - bdist_deb +nfpm package \ + --config "$REPO_DIR/pkg/debian/nfpm.yaml" \ + --packager deb \ + --target "$REPO_DIR/dist/" -# should output deb_dist/archivebox_0.5.4-1.{deb,changes,buildinfo,tar.gz} +echo +echo "[√] Built .deb package:" +ls -la "$REPO_DIR/dist/"archivebox*.deb diff --git a/bin/build_docker.sh b/bin/build_docker.sh index 57cb46371b..a0c0b4d517 100755 --- a/bin/build_docker.sh +++ b/bin/build_docker.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash +# ./bin/build_docker.sh dev 'linux/arm/v7' ### Bash Environment Setup # http://redsymbol.net/articles/unofficial-bash-strict-mode/ @@ -8,26 +9,94 @@ set -o errexit set -o errtrace set -o nounset set -o pipefail -IFS=$'\n' +IFS=$' ' REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. 
&& pwd )" -VERSION="$(jq -r '.version' < "$REPO_DIR/package.json")" -SHORT_VERSION="$(echo "$VERSION" | perl -pe 's/(\d+)\.(\d+)\.(\d+)/$1.$2/g')" cd "$REPO_DIR" -which docker > /dev/null +which docker > /dev/null || exit 1 +which jq > /dev/null || exit 1 +# which pdm > /dev/null || exit 1 + +declare -a TAG_NAMES=("$@") +BRANCH_NAME="${1:-$(git rev-parse --abbrev-ref HEAD)}" +VERSION="$(grep '^version = ' "${REPO_DIR}/pyproject.toml" | awk -F'"' '{print $2}')" +GIT_SHA=sha-"$(git rev-parse --short HEAD)" +SELECTED_PLATFORMS="linux/amd64,linux/arm64" + +# if not already in TAG_NAMES, add GIT_SHA and BRANCH_NAME +if ! echo "${TAG_NAMES[@]}" | grep -q "$GIT_SHA"; then + TAG_NAMES+=("$GIT_SHA") +fi +if ! echo "${TAG_NAMES[@]}" | grep -q "$BRANCH_NAME"; then + TAG_NAMES+=("$BRANCH_NAME") +fi +if ! echo "${TAG_NAMES[@]}" | grep -q "$VERSION"; then + TAG_NAMES+=("$VERSION") +fi + +echo "[+] Building Docker image for $SELECTED_PLATFORMS: branch=$BRANCH_NAME version=$VERSION tags=${TAG_NAMES[*]}" + +declare -a FULL_TAG_NAMES +# for each tag in TAG_NAMES, add archivebox/archivebox:tag and its mirrors to FULL_TAG_NAMES +for TAG_NAME in "${TAG_NAMES[@]}"; do + [[ "$TAG_NAME" == "" ]] && continue + FULL_TAG_NAMES+=("-t archivebox/archivebox:$TAG_NAME") # ArchiveBox official Docker repo + FULL_TAG_NAMES+=("-t ghcr.io/archivebox/archivebox:$TAG_NAME") # Github Container Repo mirror +done +echo "${FULL_TAG_NAMES[@]}" + +function check_platforms() { + INSTALLED_PLATFORMS="$(docker buildx inspect | grep 'Platforms:' )" + + for REQUIRED_PLATFORM in ${SELECTED_PLATFORMS//,/$IFS}; do + echo "[+] Checking for: $REQUIRED_PLATFORM..." + if ! 
(echo "$INSTALLED_PLATFORMS" | grep -q "$REQUIRED_PLATFORM"); then + return 1 + fi + done + echo + return 0 +} + +function remove_builder() { + # remove existing xbuilder + docker buildx stop xbuilder || true + docker buildx rm xbuilder || true +} + +function create_builder() { + docker buildx use xbuilder && return 0 + echo "[+] Creating new xbuilder for: $SELECTED_PLATFORMS" + echo + docker pull 'moby/buildkit:buildx-stable-1' + + # Switch to buildx builder if already present / previously created + docker buildx create --name xbuilder --driver docker-container --bootstrap --use --platform "$SELECTED_PLATFORMS" || true + docker buildx inspect --bootstrap || true +} + +function recreate_builder() { + # Install QEMU binaries for cross-platform building if not installed + docker run --privileged --rm 'tonistiigi/binfmt' --install all + + remove_builder + create_builder +} + +# Check if docker is ready for cross-plaform builds, if not, recreate builder +docker buildx use xbuilder >/dev/null 2>&1 || create_builder +check_platforms || (recreate_builder && check_platforms) || exit 1 + + +# Make sure pyproject.toml, pdm{.dev}.lock, requirements{-dev}.txt, package{-lock}.json are all up-to-date +# echo "[!] Make sure you've run ./bin/lock_pkgs.sh recently!" +bash ./bin/lock_pkgs.sh + echo "[+] Building archivebox:$VERSION docker image..." -docker build . 
-t archivebox \ - -t archivebox:latest \ - -t archivebox:$VERSION \ - -t archivebox:$SHORT_VERSION \ - -t docker.io/nikisweeting/archivebox:latest \ - -t docker.io/nikisweeting/archivebox:$VERSION \ - -t docker.io/nikisweeting/archivebox:$SHORT_VERSION \ - -t docker.io/archivebox/archivebox:latest \ - -t docker.io/archivebox/archivebox:$VERSION \ - -t docker.io/archivebox/archivebox:$SHORT_VERSION \ - -t docker.pkg.github.com/archivebox/archivebox/archivebox:latest \ - -t docker.pkg.github.com/archivebox/archivebox/archivebox:$VERSION \ - -t docker.pkg.github.com/archivebox/archivebox/archivebox:$SHORT_VERSION +# docker builder prune +# docker build . --no-cache -t archivebox-dev \ +# replace --load with --push to deploy +# shellcheck disable=SC2068 +docker buildx build --platform "$SELECTED_PLATFORMS" --load . ${FULL_TAG_NAMES[@]} diff --git a/bin/build_docs.sh b/bin/build_docs.sh index 5fa220fbf6..30d10c29ca 100755 --- a/bin/build_docs.sh +++ b/bin/build_docs.sh @@ -15,19 +15,20 @@ REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && p if [[ -f "$REPO_DIR/.venv/bin/activate" ]]; then source "$REPO_DIR/.venv/bin/activate" else - echo "[!] Warning: No virtualenv presesnt in $REPO_DIR.venv" + echo "[!] Warning: No virtualenv present in $REPO_DIR/.venv" fi cd "$REPO_DIR" echo "[*] Fetching latest docs version" cd "$REPO_DIR/docs" -git pull +git fetch wiki || true +git fetch docs || true cd "$REPO_DIR" echo "[+] Building docs" -sphinx-apidoc -o docs archivebox cd "$REPO_DIR/docs" +make clean make html # open docs/_build/html/index.html to see the output cd "$REPO_DIR" diff --git a/bin/build_git.sh b/bin/build_git.sh index 19e185e82f..f289d16032 100755 --- a/bin/build_git.sh +++ b/bin/build_git.sh @@ -30,9 +30,5 @@ function bump_semver { echo "$1" | awk -F. 
'{$NF = $NF + 1;} 1' | sed 's/ /./g' } -OLD_VERSION="$(jq -r '.version' < "$REPO_DIR/package.json")" -NEW_VERSION="$(bump_semver "$OLD_VERSION")" -echo "[*] Bumping VERSION from $OLD_VERSION to $NEW_VERSION" -contents="$(jq ".version = \"$NEW_VERSION\"" "$REPO_DIR/package.json")" && \ -echo "${contents}" > package.json - +# OLD_VERSION="$(grep '^version = ' "${REPO_DIR}/pyproject.toml" | awk -F'"' '{print $2}')" +# NEW_VERSION="$(bump_semver "$OLD_VERSION")" diff --git a/bin/build_pip.sh b/bin/build_pip.sh index 532a80584f..382ca6de58 100755 --- a/bin/build_pip.sh +++ b/bin/build_pip.sh @@ -11,21 +11,15 @@ set -o pipefail IFS=$'\n' REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )" - -if [[ -f "$REPO_DIR/.venv/bin/activate" ]]; then - source "$REPO_DIR/.venv/bin/activate" -else - echo "[!] Warning: No virtualenv presesnt in $REPO_DIR.venv" -fi cd "$REPO_DIR" +# Generate pdm.lock, requirements.txt, and package-lock.json +bash ./bin/lock_pkgs.sh +source .venv/bin/activate -echo "[*] Cleaning up build dirs" -cd "$REPO_DIR" +echo "[+] Building sdist, bdist_wheel, and egg_info" rm -Rf build dist +uv build -echo "[+] Building sdist, bdist_wheel, and egg_info" -python3 setup.py \ - sdist --dist-dir=./pip_dist \ - bdist_wheel --dist-dir=./pip_dist \ - egg_info --egg-base=./pip_dist +echo +echo "[√] Finished. Built package in dist/" diff --git a/bin/docker_entrypoint.sh b/bin/docker_entrypoint.sh index 0d61337b7a..c6c0c2ff90 100755 --- a/bin/docker_entrypoint.sh +++ b/bin/docker_entrypoint.sh @@ -1,45 +1,217 @@ -#!/usr/bin/env bash +#!/bin/bash -DATA_DIR="${DATA_DIR:-/data}" -ARCHIVEBOX_USER="${ARCHIVEBOX_USER:-archivebox}" +# This Docker ENTRYPOINT script is called by `docker run archivebox ...` or `docker compose run archivebox ...`. +# It takes a CMD as $* shell arguments and runs it following these setup steps: +# - Set the archivebox user to use the correct PUID & PGID +# 1. 
highest precedence is for valid PUID and PGID env vars passed in explicitly +# 2. fall back to DETECTED_PUID of files found within existing data dir +# 3. fall back to DEFAULT_PUID if no data dir or its owned by root +# - Create a new /data dir if necessary and set the correct ownership on it +# - Create a new /browsers dir if necessary and set the correct ownership on it +# - Check whether we're running inside QEMU emulation and show a warning if so. +# - Check that enough free space is available on / and /data +# - Drop down to archivebox user permissions and execute passed CMD command. -# Set the archivebox user UID & GID -if [[ -n "$PUID" && "$PUID" != 0 ]]; then - usermod -u "$PUID" "$ARCHIVEBOX_USER" > /dev/null 2>&1 -fi -if [[ -n "$PGID" && "$PGID" != 0 ]]; then - groupmod -g "$PGID" "$ARCHIVEBOX_USER" > /dev/null 2>&1 +# Bash Environment Setup +# http://redsymbol.net/articles/unofficial-bash-strict-mode/ +# https://www.gnu.org/software/bash/manual/html_node/The-Set-Builtin.html +# set -o xtrace +# set -o nounset +shopt -s nullglob +set -o errexit +set -o errtrace +set -o pipefail +# IFS=$'\n' + +# Load global invariants (set by Dockerfile during image build time, not intended to be customized by users at runtime) +export DATA_DIR="${DATA_DIR:-/data}" +export ARCHIVEBOX_USER="${ARCHIVEBOX_USER:-archivebox}" + +# Global default PUID and PGID if data dir is empty and no intended PUID+PGID is set manually by user +export DEFAULT_PUID=911 +export DEFAULT_PGID=911 + +# If user tries to set PUID and PGID to root values manually, catch and reject because root is not allowed +if [[ "${PUID:-}" == "0" ]]; then + echo -e "\n[X] Error: Got PUID=$PUID and PGID=$PGID but ArchiveBox is not allowed to be run as root, please change or unset PUID & PGID and try again." > /dev/stderr + echo -e " Hint: some NFS/SMB/FUSE/etc. 
filesystems force-remap/ignore all permissions," > /dev/stderr + echo -e " leave PUID/PGID unset, disable root_squash, or use values the drive prefers (default is $DEFAULT_PUID:$DEFAULT_PGID)" > /dev/stderr + echo -e " https://linux.die.net/man/8/mount.cifs#:~:text=does%20not%20provide%20unix%20ownership" > /dev/stderr + exit 3 fi +# If data directory already exists, autodetect detect owner by looking at files within +export DETECTED_PUID="$(stat -c '%u' "$DATA_DIR/logs/errors.log" 2>/dev/null || echo "$DEFAULT_PUID")" +export DETECTED_PGID="$(stat -c '%g' "$DATA_DIR/logs/errors.log" 2>/dev/null || echo "$DEFAULT_PGID")" + +# If data directory exists but is owned by root, use defaults instead of root because root is not allowed +[[ "$DETECTED_PUID" == "0" ]] && export DETECTED_PUID="$DEFAULT_PUID" +# (GUID / DETECTED_GUID is allowed to be 0 though) + +# Set archivebox user and group ids to desired PUID/PGID +usermod -o -u "${PUID:-$DETECTED_PUID}" "$ARCHIVEBOX_USER" > /dev/null 2>&1 +groupmod -o -g "${PGID:-$DETECTED_PGID}" "$ARCHIVEBOX_USER" > /dev/null 2>&1 + +# re-set PUID and PGID to values reported by system instead of values we tried to set, +# in case wonky filesystems or Docker setups try to play UID/GID remapping tricks on us +export PUID="$(id -u archivebox)" +export PGID="$(id -g archivebox)" -# Set the permissions of the data dir to match the archivebox user +# Check if user attempted to run it in the root of their home folder or hard drive (common mistake) +if [[ -d "$DATA_DIR/Documents" || -d "$DATA_DIR/.config" || -d "$DATA_DIR/usr" || -f "$DATA_DIR/.bashrc" || -f "$DATA_DIR/.zshrc" ]]; then + echo -e "\n[X] ERROR: ArchiveBox was run from inside a home folder" + echo -e " Make sure you are inside an existing collection directory or a new empty directory and try again" + exit 3 +fi + +# Check the permissions of the data dir (or create if it doesn't exist) if [[ -d "$DATA_DIR/archive" ]]; then - # check data directory permissions - if [[ ! 
"$(stat -c %u $DATA_DIR/archive)" = "$(id -u archivebox)" ]]; then - echo "Change in ownership detected, please be patient while we chown existing files" - echo "This could take some time..." - chown $ARCHIVEBOX_USER:$ARCHIVEBOX_USER -R "$DATA_DIR" + if touch "$DATA_DIR/archive/.permissions_test_safe_to_delete" 2>/dev/null; then + # It's fine, we are able to write to the data directory (as root inside the container) + rm -f "$DATA_DIR/archive/.permissions_test_safe_to_delete" + # echo "[√] Permissions are correct" + else + # the only time this fails is if the host filesystem doesn't allow us to write as root (e.g. some NFS mapall/maproot problems, connection issues, drive disappeared, etc.) + echo -e "\n[X] Error: archivebox user (PUID=$PUID) is not able to write to your ./data/archive dir (currently owned by $(stat -c '%u' "$DATA_DIR/archive"):$(stat -c '%g' "$DATA_DIR/archive")." > /dev/stderr + echo -e " Change ./data to be owned by PUID=$PUID PGID=$PGID on the host and retry:" > /dev/stderr + echo -e " \$ chown -R $PUID:$PGID ./data\n" > /dev/stderr + echo -e " Configure the PUID & PGID environment variables to change the desired owner:" > /dev/stderr + echo -e " https://docs.linuxserver.io/general/understanding-puid-and-pgid\n" > /dev/stderr + echo -e " Hint: some NFS/SMB/FUSE/etc. filesystems force-remap/ignore all permissions," > /dev/stderr + echo -e " leave PUID/PGID unset, disable root_squash, or use values the drive prefers (default is $DEFAULT_PUID:$DEFAULT_PGID)" > /dev/stderr + echo -e " https://linux.die.net/man/8/mount.cifs#:~:text=does%20not%20provide%20unix%20ownership" > /dev/stderr + exit 3 fi else - # create data directory + # create data directory (and logs, since its the first dir ArchiveBox needs to write to) mkdir -p "$DATA_DIR/logs" - chown -R $ARCHIVEBOX_USER:$ARCHIVEBOX_USER "$DATA_DIR" fi -chown $ARCHIVEBOX_USER:$ARCHIVEBOX_USER "$DATA_DIR" +# check if novnc x11 $DISPLAY is available +export DISPLAY="${DISPLAY:-"novnc:0.0"}" +if ! 
xdpyinfo > /dev/null 2>&1; then + # cant connect to x11 display, unset it so that chrome doesn't try to connect to it and hang indefinitely + unset DISPLAY +fi + +# force set the ownership of the data dir contents to the archivebox user and group +# this is needed because Docker Desktop often does not map user permissions from the host properly +chown $PUID:$PGID "$DATA_DIR" +if ! chown $PUID:$PGID "$DATA_DIR"/* > /dev/null 2>&1; then + # users may store the ./data/archive folder on a network mount that prevents chmod/chown + # fallback to chowning everything else in ./data and leaving ./data/archive alone + find "$DATA_DIR" -type d -not -path "$DATA_DIR/archive*" -exec chown $PUID:$PGID {} \; > /dev/null 2>&1 + find "$DATA_DIR" -type f -not -path "$DATA_DIR/archive/*" -exec chown $PUID:$PGID {} \; > /dev/null 2>&1 +fi + + +# also chown BROWSERS_DIR because otherwise 'archivebox setup' wont be able to 'playwright install chromium' at runtime +export PLAYWRIGHT_BROWSERS_PATH="${PLAYWRIGHT_BROWSERS_PATH:-/browsers}" +mkdir -p "$PLAYWRIGHT_BROWSERS_PATH/permissions_test_safe_to_delete" +rm -Rf "$PLAYWRIGHT_BROWSERS_PATH/permissions_test_safe_to_delete" +chown $PUID:$PGID "$PLAYWRIGHT_BROWSERS_PATH" +if [[ -d "$PLAYWRIGHT_BROWSERS_PATH/.links" ]]; then + chown $PUID:$PGID "$PLAYWRIGHT_BROWSERS_PATH"/* + chown $PUID:$PGID "$PLAYWRIGHT_BROWSERS_PATH"/.* + chown -h $PUID:$PGID "$PLAYWRIGHT_BROWSERS_PATH"/.links/* +fi + +# also create and chown tmp dir and lib dir (and their default equivalents inside data/) +# mkdir -p "$DATA_DIR"/lib/bin +# chown $PUID:$PGID "$DATA_DIR"/lib "$DATA_DIR"/lib/* +chown $PUID:$PGID "$LIB_DIR" 2>/dev/null || true # best-effort: must not trip errexit when LIB_DIR is unset/missing +[[ -n "${LIB_DIR:-}" ]] && chown $PUID:$PGID "$LIB_DIR"/* 2>/dev/null & # glob must sit outside the quotes to expand; guard so an empty LIB_DIR never becomes /* + +# mkdir -p "$DATA_DIR"/tmp/workers +# chown $PUID:$PGID "$DATA_DIR"/tmp "$DATA_DIR"/tmp/* +chown $PUID:$PGID "$TMP_DIR" 2>/dev/null || true # best-effort: must not trip errexit when TMP_DIR is unset/missing +[[ -n "${TMP_DIR:-}" ]] && chown $PUID:$PGID "$TMP_DIR"/* 2>/dev/null & # glob must sit outside the quotes to expand; guard so an empty TMP_DIR never becomes /* + +# (this check is written in blood in 2023, QEMU silently breaks things in ways that are not obvious) +export 
IN_QEMU="$(pmap 1 | grep qemu >/dev/null && echo 'True' || echo 'False')" +if [[ "$IN_QEMU" == "True" ]]; then + echo -e "\n[!] Warning: Running $(uname -m) docker image using QEMU emulation, some things will break!" > /dev/stderr + echo -e " chromium (screenshot, pdf, dom), singlefile, and any dependencies that rely on inotify will not run in QEMU." > /dev/stderr + echo -e " See here for more info: https://github.com/microsoft/playwright/issues/17395#issuecomment-1250830493\n" > /dev/stderr +fi + +# check disk space free on /, /data, and /data/archive, warn on <500Mb free, error on <100Mb free +export ROOT_USAGE="$(df --output=pcent,avail / | tail -n 1 | xargs)" +export ROOT_USED_PCT="${ROOT_USAGE%%%*}" +export ROOT_AVAIL_KB="$(echo "$ROOT_USAGE" | awk '{print $2}')" +if [[ "$ROOT_AVAIL_KB" -lt 100000 ]]; then + echo -e "\n[!] Warning: Docker root filesystem is completely out of space! (${ROOT_USED_PCT}% used on /)" > /dev/stderr + echo -e " you need to free up at least 100Mb in your Docker VM to continue:" > /dev/stderr + echo -e " \$ docker system prune\n" > /dev/stderr + df -kh / > /dev/stderr + exit 3 +elif [[ "$ROOT_USED_PCT" -ge 99 ]] || [[ "$ROOT_AVAIL_KB" -lt 500000 ]]; then + echo -e "\n[!] Warning: Docker root filesystem is running out of space! (${ROOT_USED_PCT}% used on /)" > /dev/stderr + echo -e " you may need to free up space in your Docker VM soon:" > /dev/stderr + echo -e " \$ docker system prune\n" > /dev/stderr + df -kh / > /dev/stderr +fi + +export DATA_USAGE="$(df --output=pcent,avail "$DATA_DIR" | tail -n 1 | xargs)" +export DATA_USED_PCT="${DATA_USAGE%%%*}" +export DATA_AVAIL_KB="$(echo "$DATA_USAGE" | awk '{print $2}')" +if [[ "$DATA_AVAIL_KB" -lt 100000 ]]; then + echo -e "\n[!] Warning: Docker data volume is completely out of space! 
(${DATA_USED_PCT}% used on $DATA_DIR)" > /dev/stderr + echo -e " you need to free up at least 100Mb on the drive holding your data directory" > /dev/stderr + echo -e " \$ ncdu -x data\n" > /dev/stderr + df -kh "$DATA_DIR" > /dev/stderr + sleep 5 +elif [[ "$DATA_USED_PCT" -ge 99 ]] || [[ "$DATA_AVAIL_KB" -lt 500000 ]]; then # compare the data volume's own free space, not the root filesystem's + echo -e "\n[!] Warning: Docker data volume is running out of space! (${DATA_USED_PCT}% used on $DATA_DIR)" > /dev/stderr + echo -e " you may need to free up space on the drive holding your data directory soon" > /dev/stderr + echo -e " \$ ncdu -x data\n" > /dev/stderr + df -kh "$DATA_DIR" > /dev/stderr +else + # data/ has space available, but check data/archive separately, because it might be on a network mount or external drive + if [[ -d "$DATA_DIR/archive" ]]; then + export ARCHIVE_USAGE="$(df --output=pcent,avail "$DATA_DIR/archive" | tail -n 1 | xargs)" + export ARCHIVE_USED_PCT="${ARCHIVE_USAGE%%%*}" + export ARCHIVE_AVAIL_KB="$(echo "$ARCHIVE_USAGE" | awk '{print $2}')" + if [[ "$ARCHIVE_AVAIL_KB" -lt 100000 ]]; then + echo -e "\n[!] Warning: data/archive folder is completely out of space! (${ARCHIVE_USED_PCT}% used on $DATA_DIR/archive)" > /dev/stderr + echo -e " you need to free up at least 100Mb on the drive holding your data/archive directory" > /dev/stderr + echo -e " \$ ncdu -x data/archive\n" > /dev/stderr + df -kh "$DATA_DIR/archive" > /dev/stderr + sleep 5 + elif [[ "$ARCHIVE_USED_PCT" -ge 99 ]] || [[ "$ARCHIVE_AVAIL_KB" -lt 500000 ]]; then # compare the archive mount's own free space, not the root filesystem's + echo -e "\n[!] Warning: data/archive folder is running out of space! 
(${ARCHIVE_USED_PCT}% used on $DATA_DIR/archive)" > /dev/stderr + echo -e " you may need to free up space on the drive holding your data/archive directory soon" > /dev/stderr + echo -e " \$ ncdu -x data/archive\n" > /dev/stderr + df -kh "$DATA_DIR/archive" > /dev/stderr + fi + fi +fi + +# set DBUS_SYSTEM_BUS_ADDRESS & DBUS_SESSION_BUS_ADDRESS +# (dbus is not actually needed, it makes chrome log fewer warnings but isn't worth making our docker images bigger) +# service dbus start >/dev/null 2>&1 & +# export $(dbus-launch --close-stderr) + + +export ARCHIVEBOX_BIN_PATH="$(which archivebox)" # Drop permissions to run commands as the archivebox user -if [[ "$1" == /* || "$1" == "echo" || "$1" == "archivebox" ]]; then - # arg 1 is a binary, execute it verbatim - # e.g. "archivebox init" - # "/bin/bash" - # "echo" - exec gosu "$ARCHIVEBOX_USER" bash -c "$*" +if [[ "$1" == /* || "$1" == "bash" || "$1" == "sh" || "$1" == "echo" || "$1" == "cat" || "$1" == "whoami" || "$1" == "archivebox" ]]; then + # handle "docker run archivebox /bin/somecommand --with=some args" by passing args directly to bash -c + # e.g. "docker run archivebox archivebox init: + # "docker run archivebox /venv/bin/ipython3" + # "docker run archivebox /bin/bash -c '...'" + # "docker run archivebox cat /VERSION.txt" + exec gosu "$PUID" /bin/bash -c "exec $(printf ' %q' "$@")" + # printf requotes shell parameters properly https://stackoverflow.com/a/39463371/2156113 + # gosu spawns an ephemeral bash process owned by archivebox user (bash wrapper is needed to load env vars, PATH, and setup terminal TTY) + # outermost exec hands over current process ID to inner bash process, inner exec hands over inner bash PID to user's command else - # no command given, assume args were meant to be passed to archivebox cmd - # e.g. 
"add https://example.com" - # "manage createsupseruser" - # "server 0.0.0.0:8000" - exec gosu "$ARCHIVEBOX_USER" bash -c "archivebox $*" + # handle "docker run archivebox add some subcommand --with=args abc" by calling archivebox to run as args as CLI subcommand + # e.g. "docker run archivebox help" + # "docker run archivebox add --depth=1 https://example.com" + # "docker run archivebox manage createsupseruser" + # "docker run archivebox server 0.0.0.0:8000" + exec gosu "$PUID" "$ARCHIVEBOX_BIN_PATH" "$@" fi diff --git a/bin/docker_layers.sh b/bin/docker_layers.sh new file mode 100755 index 0000000000..880ecb4cb3 --- /dev/null +++ b/bin/docker_layers.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash + +# This script takes a single Docker image tag (e.g. "ubuntu:latest") as input +# and shows the contents of the filesystem for each layer in the image. + +if [ $# -ne 1 ]; then + echo "Usage: $0 " + exit 1 +fi + +IMAGE=$1 +# TMPDIR=$(mktemp -d) +mkdir -p "$PWD/tmp" +TMPDIR="$PWD/tmp" + +# Save the Docker image to a tar archive +echo "Saving Docker image '$IMAGE'..." +if ! docker save "$IMAGE" | pv > "${TMPDIR}/image.tar"; then + echo "Failed to save image '$IMAGE'. Make sure the image exists and Docker is running." + rm -rf "${TMPDIR}" + exit 1 +fi + +cd "${TMPDIR}" || exit 1 + +# Extract the top-level metadata of the image tar +echo "Extracting image metadata..." +pwd +tar -xzf image.tar +chmod -R 777 . +cd blobs/sha256 || exit 1 + +# Typically, the saved image will contain multiple directories each representing a layer. +# Each layer directory should have a 'layer.tar' file that contains the filesystem for that layer. 
+for LAYERFILE in ./*; do + if [ -f "${LAYERFILE}" ]; then + mv "${LAYERFILE}" "${LAYERFILE}.tar" + mkdir -p "${LAYERFILE}" + tar -xzf "${LAYERFILE}.tar" -C "${LAYERFILE}" + rm "${LAYERFILE}.tar" + echo "-----------------------------------------------------------------" + echo "Contents of layer: ${LAYERFILE%/}" + echo "-----------------------------------------------------------------" + # List the files in the layer.tar without extracting + tree -L 2 "${LAYERFILE}" + echo + fi +done diff --git a/bin/export_browser_history.sh b/bin/export_browser_history.sh index f595ee39e4..6aa8f4d55a 100755 --- a/bin/export_browser_history.sh +++ b/bin/export_browser_history.sh @@ -1,60 +1,163 @@ -#!/bin/bash +#!/usr/bin/env bash +# +# Helper script to export browser history and bookmarks to a format ArchiveBox can ingest. +# Usage: +# curl -O 'https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/dev/bin/export_browser_history.sh' +# bash export_browser_history.sh --chrome +# bash export_browser_history.sh --firefox +# bash export_browser_history.sh --safari +# ls +# chrome_history.json +# chrome_bookmarks.json +# firefox_history.json +# firefox_bookmarks.json +# safari_history.json +# +# Assumptions: +# +# * you're running this on macOS or Linux +# * you're running a reasonably modern version of Bash +# * macOS users: `brew install bash` +# +# Dependencies: +# +# * sqlite +# * jq (for chrome bookmarks) +# +set -eo pipefail + +BROWSER_TO_EXPORT="${1?Please specify --chrome, --firefox, or --safari}" OUTPUT_DIR="$(pwd)" -if [[ "$1" == "--chrome" ]]; then - # Google Chrome / Chromium +is_linux() { + [[ "$(uname -s)" == "Linux" ]] +} + +find_firefox_places_db() { + # shellcheck disable=SC2012 # `ls` with path expansion is good enough, don't need `find` + if is_linux; then + ls ~/.mozilla/firefox/*.default*/places.sqlite | head -n 1 + else + ls ~/Library/Application\ Support/Firefox/Profiles/*.default*/places.sqlite | head -n 1 + fi +} + +find_chrome_history_db() { + if is_linux; 
then + local config_home="${XDG_CONFIG_HOME:-${HOME}/.config}" + for path in \ + "${config_home}/chromium/Default/History" \ + "${config_home}/google-chrome/Default/History"; + do + if [ -f "${path}" ]; then + echo "${path}" + return + fi + done + + echo "Unable to find Chrome history database. You can supply it manually as a second parameter." >&2 + exit 1 + else + echo ~/Library/Application\ Support/Google/Chrome/Default/History + fi +} + +export_chrome() { if [[ -e "$2" ]]; then cp "$2" "$OUTPUT_DIR/chrome_history.db.tmp" else - default=$(ls ~/Library/Application\ Support/Google/Chrome/Default/History) + default="$(find_chrome_history_db)" echo "Defaulting to history db: $default" echo "Optionally specify the path to a different sqlite history database as the 2nd argument." cp "$default" "$OUTPUT_DIR/chrome_history.db.tmp" fi - sqlite3 "$OUTPUT_DIR/chrome_history.db.tmp" "SELECT \"[\" || group_concat(json_object('timestamp', last_visit_time, 'description', title, 'href', url)) || \"]\" FROM urls;" > "$OUTPUT_DIR/chrome_history.json" - jq < "$(dirname "${2:-$default}")"/Bookmarks '.roots.other.children[] | {href: .url, description: .name, timestamp: .date_added}' > "$OUTPUT_DIR/chrome_bookmarks.json" - - rm "$DATA_DIR"/output/sources/chrome_history.db.* + sqlite3 "$OUTPUT_DIR/chrome_history.db.tmp" " + SELECT '[' || group_concat( + json_object('timestamp', last_visit_time, 'description', title, 'href', url) + ) || ']' + FROM urls;" > "$OUTPUT_DIR/chrome_history.json" + + jq '.roots.other.children[] | {href: .url, description: .name, timestamp: .date_added}' \ + < "$(dirname "${2:-$default}")"/Bookmarks \ + > "$OUTPUT_DIR/chrome_bookmarks.json" + + rm "$OUTPUT_DIR"/chrome_history.db.* echo "Chrome history exported to:" - echo " output/sources/chrome_history.json" -fi + echo " $OUTPUT_DIR/chrome_history.json" + echo " $OUTPUT_DIR/chrome_bookmarks.json" +} -if [[ "$1" == "--firefox" ]]; then - # Firefox +export_firefox() { if [[ -e "$2" ]]; then cp "$2" 
"$OUTPUT_DIR/firefox_history.db.tmp" else - default=$(ls ~/Library/Application\ Support/Firefox/Profiles/*.default/places.sqlite) + default="$(find_firefox_places_db)" echo "Defaulting to history db: $default" echo "Optionally specify the path to a different sqlite history database as the 2nd argument." cp "$default" "$OUTPUT_DIR/firefox_history.db.tmp" fi - - sqlite3 "$OUTPUT_DIR/firefox_history.db.tmp" "SELECT \"[\" || group_concat(json_object('timestamp', last_visit_date, 'description', title, 'href', url)) || \"]\" FROM moz_places;" > "$OUTPUT_DIR/firefox_history.json" - sqlite3 "$OUTPUT_DIR/firefox_history.db.tmp" "SELECT \"[\" || group_concat(json_object('timestamp', b.dateAdded, 'description', b.title, 'href', f.url)) || \"]\" FROM moz_bookmarks AS b JOIN moz_places AS f ON f.id = b.fk" > "$OUTPUT_DIR/firefox_bookmarks.json" - - rm "$DATA_DIR"/output/sources/firefox_history.db.* + + sqlite3 "$OUTPUT_DIR/firefox_history.db.tmp" " + SELECT + '[' || group_concat( + json_object( + 'timestamp', last_visit_date, + 'description', title, + 'href', url + ) + ) || ']' + FROM moz_places;" > "$OUTPUT_DIR/firefox_history.json" + + sqlite3 "$OUTPUT_DIR/firefox_history.db.tmp" " + with recursive tags AS ( + select id, title, '' AS tags + FROM moz_bookmarks + where parent == 0 + UNION ALL + select c.id, p.title, c.title || ',' || tags AS tags + from moz_bookmarks AS c + JOIN tags AS p + ON c.parent = p.id + ) + + SELECT '[' || group_concat(json_object('timestamp', b.dateAdded, 'description', b.title, 'href', f.url, 'tags', tags.tags)) || ']' + FROM moz_bookmarks AS b + JOIN moz_places AS f ON f.id = b.fk + JOIN tags ON tags.id = b.parent + WHERE f.url LIKE '%://%';" > "$OUTPUT_DIR/firefox_bookmarks.json" + + rm "$OUTPUT_DIR"/firefox_history.db.* echo "Firefox history exported to:" - echo " output/sources/firefox_history.json" - echo " output/sources/firefox_bookmarks.json" -fi + echo " $OUTPUT_DIR/firefox_history.json" + echo " $OUTPUT_DIR/firefox_bookmarks.json" +} -if [[ 
"$1" == "--safari" ]]; then - # Safari +export_safari() { if [[ -e "$2" ]]; then cp "$2" "$OUTPUT_DIR/safari_history.db.tmp" else - default="~/Library/Safari/History.db" + default="$HOME/Library/Safari/History.db" echo "Defaulting to history db: $default" echo "Optionally specify the path to a different sqlite history database as the 2nd argument." cp "$default" "$OUTPUT_DIR/safari_history.db.tmp" fi - + sqlite3 "$OUTPUT_DIR/safari_history.db.tmp" "select url from history_items" > "$OUTPUT_DIR/safari_history.json" - - rm "$DATA_DIR"/output/sources/safari_history.db.* + + rm "$OUTPUT_DIR"/safari_history.db.* echo "Safari history exported to:" - echo " output/sources/safari_history.json" + echo " $OUTPUT_DIR/safari_history.json" +} + +if [[ "$BROWSER_TO_EXPORT" == "--chrome" ]]; then + export_chrome "$@" +elif [[ "$BROWSER_TO_EXPORT" == "--firefox" ]]; then + export_firefox "$@" +elif [[ "$BROWSER_TO_EXPORT" == "--safari" ]]; then + export_safari "$@" +else + echo "Unrecognized argument: $1" >&2 + exit 1 fi diff --git a/bin/kill_chrome.sh b/bin/kill_chrome.sh new file mode 100755 index 0000000000..3d6996ba05 --- /dev/null +++ b/bin/kill_chrome.sh @@ -0,0 +1,156 @@ +#!/usr/bin/env bash +# Kill zombie Chrome/Chromium processes listening on 127.0.0.1 +# Works cross-platform on macOS and Linux +# +# Usage: +# ./bin/kill_chrome.sh # Kill Chrome processes with verification +# ./bin/kill_chrome.sh --pkill # Quick kill using pkill (less precise) +# ./bin/kill_chrome.sh --help # Show this help + +set -e + +# Detect OS +OS="$(uname -s)" + +# Chrome binary patterns to search for (cross-platform) +CHROME_PATTERNS=( + "Google Chrome" + "google-chrome" + "chrome" + "chromium" + "chromium-browser" + "Chromium" +) + +# Function to kill Chrome processes +kill_chrome_processes() { + echo "Searching for Chrome processes listening on 127.0.0.1..." 
+ local killed=0 + + for pattern in "${CHROME_PATTERNS[@]}"; do + # Find processes matching the pattern with remote debugging + if [ "$OS" = "Darwin" ]; then + # macOS + pids=$(ps aux | grep -i "$pattern" | grep -E "(remote-debugging-port|remote-debugging-address=127\.0\.0\.1)" | grep -v grep | awk '{print $2}' || true) + else + # Linux + pids=$(ps aux | grep -i "$pattern" | grep -E "(remote-debugging-port|remote-debugging-address=127\.0\.0\.1)" | grep -v grep | awk '{print $2}' || true) + fi + + if [ -n "$pids" ]; then + echo "Found Chrome processes ($pattern): $pids" + for pid in $pids; do + # Try regular kill first + if kill "$pid" 2>/dev/null; then + echo " Killed $pid" + killed=$((killed + 1)) + sleep 0.1 + fi + + # Check if still alive + if ps -p "$pid" > /dev/null 2>&1; then + # Check process state first to avoid attempting impossible kills + if [ "$OS" = "Darwin" ]; then + state=$(ps -o state -p "$pid" 2>/dev/null | tail -1 | tr -d ' ') + else + state=$(ps -o stat -p "$pid" 2>/dev/null | tail -1 | tr -d ' ') + fi + + # Check if it's a zombie/uninterruptible process BEFORE trying to kill + if [[ "$state" == *"Z"* ]] || [[ "$state" == *"D"* ]] || [[ "$state" == *"UNE"* ]]; then + echo " WARNING: $pid is in uninterruptible/zombie state ($state) - cannot be killed" + echo " Process will clean up automatically or requires system reboot" + else + # Try force kill + echo " Force killing $pid with -9..." + if kill -9 "$pid" 2>/dev/null; then + # Wait briefly and verify + sleep 0.2 + if ! 
ps -p "$pid" > /dev/null 2>&1; then + echo " Force killed $pid" + killed=$((killed + 1)) + else + echo " WARNING: $pid survived kill -9 (state: $state)" + fi + else + echo " ERROR: Failed to kill $pid (state: $state)" + fi + fi + fi + done + fi + done + + if [ $killed -eq 0 ]; then + echo "No Chrome processes listening on 127.0.0.1 found (or all are zombie/uninterruptible)" + else + echo "Successfully killed $killed Chrome process(es)" + fi + + # Show remaining Chrome processes (if any) + echo "" + echo "Remaining Chrome processes listening on 127.0.0.1:" + for pattern in "${CHROME_PATTERNS[@]}"; do + ps aux | grep -i "$pattern" | grep -E "(remote-debugging-port|remote-debugging-address=127\.0\.0\.1)" | grep -v grep || true + done | head -10 + + if [ $(ps aux | grep -iE "(google chrome|chrome|chromium)" | grep -E "(remote-debugging-port|remote-debugging-address=127\.0\.0\.1)" | grep -v grep | wc -l) -eq 0 ]; then + echo " (none)" + fi +} + +# Alternative approach using pkill (faster but less precise) +kill_chrome_pkill() { + echo "Using pkill to kill all Chrome processes..." + + for pattern in "${CHROME_PATTERNS[@]}"; do + if pkill -9 -f "$pattern" 2>/dev/null; then + echo " Killed processes matching: $pattern" + fi + done + + sleep 0.5 + echo "Done" +} + +# Show help +show_help() { + cat << EOF +Kill zombie Chrome/Chromium processes listening on 127.0.0.1 + +Usage: + $0 [OPTIONS] + +Options: + (none) Kill Chrome processes with state verification (recommended) + --pkill, -p Quick kill using pkill (faster but less precise) + --help, -h Show this help message + +Description: + This script finds and kills Chrome/Chromium processes that are listening + on 127.0.0.1 (with --remote-debugging-port or --remote-debugging-address). + + Supports multiple Chrome binary names: + - Google Chrome / chrome / google-chrome + - Chromium / chromium / chromium-browser + + Works on macOS and Linux. 
+ + Zombie/uninterruptible processes (state UNE/Z/D) will be detected and + reported but cannot be killed. They will clean up automatically. + +Examples: + $0 # Kill with verification + $0 --pkill # Quick kill all Chrome processes + +EOF +} + +# Parse arguments +if [ "$1" = "--help" ] || [ "$1" = "-h" ]; then + show_help +elif [ "$1" = "--pkill" ] || [ "$1" = "-p" ]; then + kill_chrome_pkill +else + kill_chrome_processes +fi diff --git a/bin/lint.sh b/bin/lint.sh index 605f966d72..752c95fb8a 100755 --- a/bin/lint.sh +++ b/bin/lint.sh @@ -14,10 +14,33 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )" source "$DIR/.venv/bin/activate" -echo "[*] Running flake8..." -flake8 archivebox && echo "√ No errors found." +cd "$DIR" + +FAILED=0 + +echo "[*] Running ruff..." +if ruff check --fix archivebox; then + echo "√ No errors found." +else + FAILED=1 +fi + +echo + +echo "[*] Running pyright..." +if pyright; then + echo "√ No errors found." +else + FAILED=1 +fi echo -echo "[*] Running mypy..." -echo "(skipping for now, run 'mypy archivebox' to run it manually)" +echo "[*] Running ty..." +if ty check --force-exclude --exclude '**/migrations/**' archivebox; then + echo "√ No errors found." +else + FAILED=1 +fi + +exit "$FAILED" diff --git a/bin/lock_pkgs.sh b/bin/lock_pkgs.sh new file mode 120000 index 0000000000..a645eb3092 --- /dev/null +++ b/bin/lock_pkgs.sh @@ -0,0 +1 @@ +setup.sh \ No newline at end of file diff --git a/bin/release.sh b/bin/release.sh index 34256fada8..0df1f9636b 100755 --- a/bin/release.sh +++ b/bin/release.sh @@ -1,38 +1,399 @@ #!/usr/bin/env bash -### Bash Environment Setup -# http://redsymbol.net/articles/unofficial-bash-strict-mode/ -# https://www.gnu.org/software/bash/manual/html_node/The-Set-Builtin.html -# set -o xtrace -set -o errexit -set -o errtrace -set -o nounset -set -o pipefail -IFS=$'\n' - -REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. 
&& pwd )" -cd "$REPO_DIR" - - -# Run the linters and tests -# ./bin/lint.sh -# ./bin/test.sh - -# Run all the build scripts -./bin/build_git.sh -./bin/build_docs.sh -./bin/build_pip.sh -./bin/build_deb.sh -./bin/build_brew.sh -./bin/build_docker.sh - -# Push relase to public repositories -./bin/release_git.sh -./bin/release_docs.sh -./bin/release_pip.sh -./bin/release_deb.sh -./bin/release_brew.sh -./bin/release_docker.sh - -VERSION="$(jq -r '.version' < "$REPO_DIR/package.json")" -echo "[√] Done. Published version v$VERSION" +set -Eeuo pipefail +IFS=$'\n\t' + +REPO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +WORKSPACE_DIR="$(cd "${REPO_DIR}/.." && pwd)" +cd "${REPO_DIR}" + +TAG_PREFIX="v" +PYPI_PACKAGE="archivebox" + +source_optional_env() { + if [[ -f "${REPO_DIR}/.env" ]]; then + set -a + # shellcheck disable=SC1091 + source "${REPO_DIR}/.env" + set +a + fi +} + +repo_slug() { + python3 - <<'PY' +import re +import subprocess + +remote = subprocess.check_output( + ['git', 'remote', 'get-url', 'origin'], + text=True, +).strip() + +patterns = [ + r'github\.com[:/](?P<slug>[^/]+/[^/.]+)(?:\.git)?$', # named group is read back below via match.group('slug') + r'github\.com/(?P<slug>[^/]+/[^/.]+)(?:\.git)?$', +] + +for pattern in patterns: + match = re.search(pattern, remote) + if match: + print(match.group('slug')) + raise SystemExit(0) + +raise SystemExit(f'Unable to parse GitHub repo slug from remote: {remote}') +PY +} + +default_branch() { + if [[ -n "${DEFAULT_BRANCH:-}" ]]; then + echo "${DEFAULT_BRANCH}" + return 0 + fi + if git symbolic-ref refs/remotes/origin/HEAD >/dev/null 2>&1; then + git symbolic-ref refs/remotes/origin/HEAD | sed 's#^refs/remotes/origin/##' + return 0 + fi + git remote show origin | sed -n '/HEAD branch/s/.*: //p' | head -n 1 +} + +current_version() { + python3 - <<'PY' +from pathlib import Path +import json +import re + +versions = [] +pyproject_text = Path('pyproject.toml').read_text() +pyproject_match = re.search(r'^version = "([^"]+)"$', pyproject_text, re.MULTILINE) +if pyproject_match: + 
versions.append(pyproject_match.group(1)) + +package_json = json.loads(Path('etc/package.json').read_text()) +if 'version' in package_json: + versions.append(package_json['version']) + +def parse(version: str) -> tuple[int, int, int, int, int]: + match = re.fullmatch(r'(\d+)\.(\d+)\.(\d+)(?:-?rc(\d*))?$', version) + if not match: + raise SystemExit(f'Unsupported version format: {version}') + major, minor, patch, rc = match.groups() + rc_value = int(rc) if rc else (0 if 'rc' in version else 10_000) + return (int(major), int(minor), int(patch), 0 if 'rc' in version else 1, rc_value) + +print(max(versions, key=parse)) +PY +} + +bump_version() { + python3 - <<'PY' +from pathlib import Path +import json +import re + +def parse(version: str) -> tuple[int, int, int, int, int]: + match = re.fullmatch(r'(\d+)\.(\d+)\.(\d+)(?:-?rc(\d*))?$', version) + if not match: + raise SystemExit(f'Unsupported version format: {version}') + major, minor, patch, rc = match.groups() + rc_value = int(rc) if rc else (0 if 'rc' in version else 10_000) + return (int(major), int(minor), int(patch), 0 if 'rc' in version else 1, rc_value) + +pyproject_path = Path('pyproject.toml') +pyproject_text = pyproject_path.read_text() +pyproject_match = re.search(r'^version = "([^"]+)"$', pyproject_text, re.MULTILINE) +if not pyproject_match: + raise SystemExit('Failed to find version in pyproject.toml') + +package_path = Path('etc/package.json') +package_json = json.loads(package_path.read_text()) +if 'version' not in package_json: + raise SystemExit('Failed to find version in etc/package.json') + +current_version = max([pyproject_match.group(1), package_json['version']], key=parse) +match = re.fullmatch(r'(\d+)\.(\d+)\.(\d+)(?:-?rc(\d*))?$', current_version) +major, minor, patch, rc = match.groups() +if 'rc' in current_version: + rc_number = int(rc or '0') + 1 + next_version = f'{major}.{minor}.{patch}rc{rc_number}' +else: + next_version = f'{major}.{minor}.{int(patch) + 1}' + +pyproject_path.write_text( 
+ re.sub(r'^version = "[^"]+"$', f'version = "{next_version}"', pyproject_text, count=1, flags=re.MULTILINE) +) +package_json['version'] = next_version +package_path.write_text(json.dumps(package_json, indent=2) + '\n') +print(next_version) +PY +} + +read_repo_version() { + local repo_dir="$1" + if [[ ! -f "${repo_dir}/pyproject.toml" ]]; then + return 1 + fi + + python3 - "${repo_dir}/pyproject.toml" <<'PY' +from pathlib import Path +import re +import sys + +text = Path(sys.argv[1]).read_text() +match = re.search(r'^version = "([^"]+)"$', text, re.MULTILINE) +if not match: + raise SystemExit('Failed to find version') +print(match.group(1)) +PY +} + +update_internal_dependencies() { + local abxbus_version abx_pkg_version abx_plugins_version abx_dl_version + + abxbus_version="$(read_repo_version "${WORKSPACE_DIR}/abxbus" || true)" + abx_pkg_version="$(read_repo_version "${WORKSPACE_DIR}/abx-pkg" || true)" + abx_plugins_version="$(read_repo_version "${WORKSPACE_DIR}/abx-plugins" || true)" + abx_dl_version="$(read_repo_version "${WORKSPACE_DIR}/abx-dl" || true)" + + python3 - "${abxbus_version}" "${abx_pkg_version}" "${abx_plugins_version}" "${abx_dl_version}" <<'PY' +from pathlib import Path +import re +import sys + +path = Path('pyproject.toml') +text = path.read_text() +for name, version in ( + ('abxbus', sys.argv[1]), + ('abx-pkg', sys.argv[2]), + ('abx-plugins', sys.argv[3]), + ('abx-dl', sys.argv[4]), +): + if version: + text = re.sub(rf'("{re.escape(name)}>=)[^"]+(")', rf'\g<1>{version}\2', text) +path.write_text(text) +PY +} + +compare_versions() { + python3 - "$1" "$2" <<'PY' +import re +import sys + +def parse(version: str) -> tuple[int, int, int, int, int]: + match = re.fullmatch(r'(\d+)\.(\d+)\.(\d+)(?:-?rc(\d*))?$', version) + if not match: + raise SystemExit(f'Unsupported version format: {version}') + major, minor, patch, rc = match.groups() + return (int(major), int(minor), int(patch), 0 if 'rc' in version else 1, int(rc or '0')) + +left, right = 
sys.argv[1], sys.argv[2] +if parse(left) > parse(right): + print('gt') +elif parse(left) == parse(right): + print('eq') +else: + print('lt') +PY +} + +latest_release_version() { + local slug="$1" + local raw_tags + raw_tags="$(gh api "repos/${slug}/releases?per_page=100" --jq '.[].tag_name' || true)" + RELEASE_TAGS="${raw_tags}" TAG_PREFIX_VALUE="${TAG_PREFIX}" python3 - <<'PY' +import os +import re + +def parse(version: str) -> tuple[int, int, int, int, int]: + match = re.fullmatch(r'(\d+)\.(\d+)\.(\d+)(?:-?rc(\d*))?$', version) + if not match: + return (-1, -1, -1, -1, -1) + major, minor, patch, rc = match.groups() + return (int(major), int(minor), int(patch), 0 if 'rc' in version else 1, int(rc or '0')) + +prefix = os.environ.get('TAG_PREFIX_VALUE', '') +versions = [line.strip() for line in os.environ.get('RELEASE_TAGS', '').splitlines() if line.strip()] +if prefix: + versions = [version[len(prefix):] if version.startswith(prefix) else version for version in versions] +if not versions: + print('') +else: + print(max(versions, key=parse)) +PY +} + +wait_for_runs() { + local slug="$1" + local event="$2" + local sha="$3" + local label="$4" + local runs_json + local attempts=0 + + while :; do + runs_json="$(GH_FORCE_TTY=0 GH_PAGER=cat gh run list --repo "${slug}" --event "${event}" --commit "${sha}" --limit 20 --json databaseId,status,conclusion,workflowName)" + if [[ "$(jq 'length' <<<"${runs_json}")" -gt 0 ]]; then + break + fi + attempts=$((attempts + 1)) + if [[ "${attempts}" -ge 30 ]]; then + echo "Timed out waiting for ${label} workflows to start" >&2 + return 1 + fi + sleep 10 + done + + while read -r run_id; do + gh run watch "${run_id}" --repo "${slug}" --exit-status + done < <(jq -r '.[].databaseId' <<<"${runs_json}") +} + +wait_for_pypi() { + local package_name="$1" + local expected_version="$2" + local attempts=0 + local published_version + + while :; do + published_version="$(curl -fsSL "https://pypi.org/pypi/${package_name}/json" | jq -r 
'.info.version')" + if [[ "${published_version}" == "${expected_version}" ]]; then + return 0 + fi + attempts=$((attempts + 1)) + if [[ "${attempts}" -ge 30 ]]; then + echo "Timed out waiting for ${package_name}==${expected_version} on PyPI" >&2 + return 1 + fi + sleep 10 + done +} + +run_checks() { + uv sync --all-extras --all-groups --no-cache --upgrade + uv build --all +} + +validate_release_state() { + local slug="$1" + local branch="$2" + local current latest relation + + if [[ "$(git branch --show-current)" != "${branch}" ]]; then + echo "Skipping release-state validation on non-default branch $(git branch --show-current)" + return 0 + fi + + current="$(current_version)" + latest="$(latest_release_version "${slug}")" + if [[ -z "${latest}" ]]; then + echo "No published releases found for ${slug}; release state is valid" + return 0 + fi + + relation="$(compare_versions "${current}" "${latest}")" + if [[ "${relation}" == "lt" ]]; then + echo "Current version ${current} is behind latest published version ${latest}" >&2 + return 1 + fi + + echo "Release state is valid: local=${current} latest=${latest}" +} + +create_release() { + local slug="$1" + local version="$2" + local prerelease_args=() + if [[ "${version}" == *rc* ]]; then + prerelease_args+=(--prerelease) + fi + if gh release view "${TAG_PREFIX}${version}" --repo "${slug}" >/dev/null 2>&1; then + echo "GitHub release ${TAG_PREFIX}${version} already exists" + return 0 + fi + + gh release create "${TAG_PREFIX}${version}" \ + --repo "${slug}" \ + --target "$(git rev-parse HEAD)" \ + --title "${TAG_PREFIX}${version}" \ + --generate-notes \ + "${prerelease_args[@]}" +} + +publish_artifacts() { + local version="$1" + local pypi_token="${UV_PUBLISH_TOKEN:-${PYPI_TOKEN:-${PYPI_PAT_SECRET:-}}}" + + if curl -fsSL "https://pypi.org/pypi/${PYPI_PACKAGE}/json" | jq -e --arg version "${version}" '.releases[$version] | length > 0' >/dev/null 2>&1; then + echo "${PYPI_PACKAGE} ${version} already published on PyPI" + else 
+ if [[ -n "${pypi_token}" ]]; then + UV_PUBLISH_TOKEN="${pypi_token}" uv publish --username=__token__ dist/* + else + echo "Missing PyPI credentials: set UV_PUBLISH_TOKEN or PYPI_TOKEN" >&2 + return 1 + fi + fi + + wait_for_pypi "${PYPI_PACKAGE}" "${version}" +} + +main() { + local slug branch version latest relation + + source_optional_env + slug="$(repo_slug)" + branch="$(default_branch)" + + if [[ "${GITHUB_EVENT_NAME:-}" == "push" ]]; then + validate_release_state "${slug}" "${branch}" + return 0 + fi + + if [[ "$(git branch --show-current)" != "${branch}" ]]; then + echo "Release must run from ${branch}, found $(git branch --show-current)" >&2 + return 1 + fi + + version="$(current_version)" + latest="$(latest_release_version "${slug}")" + if [[ -z "${latest}" ]]; then + relation="gt" + else + relation="$(compare_versions "${version}" "${latest}")" + fi + + if [[ "${relation}" == "eq" ]]; then + update_internal_dependencies + version="$(bump_version)" + run_checks + + git add -A + git commit -m "release: ${TAG_PREFIX}${version}" + git push origin "${branch}" + + wait_for_runs "${slug}" push "$(git rev-parse HEAD)" "push" + elif [[ "${relation}" == "gt" ]]; then + if [[ -n "$(git status --short)" ]]; then + echo "Refusing to publish existing unreleased version ${version} with a dirty worktree" >&2 + return 1 + fi + run_checks + wait_for_runs "${slug}" push "$(git rev-parse HEAD)" "push" + else + echo "Current version ${version} is behind latest GitHub release ${latest}" >&2 + return 1 + fi + + publish_artifacts "${version}" + create_release "${slug}" "${version}" + + latest="$(latest_release_version "${slug}")" + relation="$(compare_versions "${latest}" "${version}")" + if [[ "${relation}" != "eq" ]]; then + echo "GitHub release version mismatch: expected ${version}, got ${latest}" >&2 + return 1 + fi + + echo "Released ${PYPI_PACKAGE} ${version}" +} + +main "$@" diff --git a/bin/release_brew.sh b/bin/release_brew.sh index 526d9d59b1..19fbe90a5b 100755 --- 
a/bin/release_brew.sh +++ b/bin/release_brew.sh @@ -11,9 +11,38 @@ set -o pipefail IFS=$'\n' REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )" -VERSION="$(jq -r '.version' < "$REPO_DIR/package.json")" -SHORT_VERSION="$(echo "$VERSION" | perl -pe 's/(\d+)\.(\d+)\.(\d+)/$1.$2/g')" cd "$REPO_DIR" -# TODO -exit 0 +VERSION="$(grep '^version = ' "${REPO_DIR}/pyproject.toml" | awk -F'"' '{print $2}')" +FORMULA_FILE="$REPO_DIR/brew_dist/archivebox.rb" +TAP_REPO="ArchiveBox/homebrew-archivebox" + +if [ ! -f "$FORMULA_FILE" ]; then + echo "[!] Formula not found at $FORMULA_FILE" + echo " Run ./bin/build_brew.sh first to generate it." + exit 1 +fi + +echo "[+] Releasing Homebrew formula for archivebox==${VERSION} to ${TAP_REPO}..." + +# Clone the tap repo, update formula, commit, and push +TMPDIR="$(mktemp -d)" +trap "rm -rf $TMPDIR" EXIT + +git clone "https://github.com/${TAP_REPO}.git" "$TMPDIR/tap" +cp "$FORMULA_FILE" "$TMPDIR/tap/archivebox.rb" + +cd "$TMPDIR/tap" +git add archivebox.rb +if git diff --cached --quiet; then + echo "[i] No changes to formula, skipping release." + exit 0 +fi + +git commit -m "Update archivebox to v${VERSION}" +git push origin HEAD + +echo "[√] Homebrew formula pushed to ${TAP_REPO}" +echo " Users can install with:" +echo " brew tap archivebox/archivebox" +echo " brew install archivebox" diff --git a/bin/release_deb.sh b/bin/release_deb.sh index a470c4f37f..45779f5cf8 100755 --- a/bin/release_deb.sh +++ b/bin/release_deb.sh @@ -10,41 +10,24 @@ set -o nounset set -o pipefail IFS=$'\n' - -CURRENT_PLAFORM="$(uname)" -REQUIRED_PLATFORM="Linux" -if [[ "$CURRENT_PLAFORM" != "$REQUIRED_PLATFORM" ]]; then - echo "[!] Skipping the Debian package build on $CURRENT_PLAFORM (it can only be run on $REQUIRED_PLATFORM)." - exit 0 -fi - - REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. 
&& pwd )" -VERSION="$(jq -r '.version' < "$REPO_DIR/package.json")" -DEBIAN_VERSION="${DEBIAN_VERSION:-1}" cd "$REPO_DIR" +VERSION="$(grep '^version = ' "${REPO_DIR}/pyproject.toml" | awk -F'"' '{print $2}')" -echo "[+] Loading PGP keys from env vars and filesystem..." -# https://github.com/ArchiveBox/debian-archivebox/settings/secrets/actions -PGP_KEY_ID="${PGP_KEY_ID:-BC2D21B0D84E16C437300B8652423FBED1586F45}" -[[ "${PGP_PUBLIC_KEY:-}" ]] && echo "$PGP_PUBLIC_KEY" > /tmp/archivebox_gpg.key.pub -[[ "${PGP_PRIVATE_KEY:-}" ]] && echo "$PGP_PRIVATE_KEY" > /tmp/archivebox_gpg.key -gpg --import /tmp/archivebox_gpg.key.pub || true -gpg --import --allow-secret-key-import /tmp/archivebox_gpg.key || true -echo "$PGP_KEY_ID:6:" | gpg --import-ownertrust || true - -echo "[*] Signing build and changelog with PGP..." -debsign --re-sign -k "$PGP_KEY_ID" "deb_dist/archivebox_${VERSION}-${DEBIAN_VERSION}_source.changes" +echo "[+] Releasing .deb package for archivebox==${VERSION}..." -# make sure you have this in ~/.dput.cf: -# [archivebox-ppa] -# fqdn: ppa.launchpad.net -# method: ftp -# incoming: ~archivebox/ubuntu/archivebox/ -# login: anonymous -# allow_unsigned_uploads: 0 +DEB_FILE="$(ls -1 "$REPO_DIR/dist/"archivebox*.deb 2>/dev/null | head -1)" +if [ -z "$DEB_FILE" ]; then + echo "[!] No .deb file found in dist/. Run ./bin/build_deb.sh first." + exit 1 +fi +echo "[+] Uploading $DEB_FILE to GitHub Release v${VERSION}..." 
+gh release upload "v${VERSION}" "$DEB_FILE" --clobber 2>/dev/null || \ + gh release create "v${VERSION}" "$DEB_FILE" --title "v${VERSION}" --generate-notes -echo "[^] Uploading to launchpad.net" -dput -f archivebox "deb_dist/archivebox_${VERSION}-${DEBIAN_VERSION}_source.changes" +echo "[√] .deb package uploaded to GitHub Release v${VERSION}" +echo " Users can install with:" +echo " curl -fsSL https://github.com/ArchiveBox/ArchiveBox/releases/download/v${VERSION}/archivebox_${VERSION}_amd64.deb -o /tmp/archivebox.deb" +echo " sudo apt install /tmp/archivebox.deb" diff --git a/bin/release_docker.sh b/bin/release_docker.sh index 80353808d0..3a87457d87 100755 --- a/bin/release_docker.sh +++ b/bin/release_docker.sh @@ -8,18 +8,54 @@ set -o errexit set -o errtrace set -o nounset set -o pipefail -IFS=$'\n' +IFS=$' ' REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )" -VERSION="$(jq -r '.version' < "$REPO_DIR/package.json")" -SHORT_VERSION="$(echo "$VERSION" | perl -pe 's/(\d+)\.(\d+)\.(\d+)/$1.$2/g')" cd "$REPO_DIR" +declare -a TAG_NAMES=("$@") +BRANCH_NAME="${1:-$(git rev-parse --abbrev-ref HEAD)}" +VERSION="$(grep '^version = ' "${REPO_DIR}/pyproject.toml" | awk -F'"' '{print $2}')" +GIT_SHA=sha-"$(git rev-parse --short HEAD)" +SELECTED_PLATFORMS="linux/amd64,linux/arm64" -echo "[^] Uploading docker image" +# if not already in TAG_NAMES, add GIT_SHA and BRANCH_NAME +if ! echo "${TAG_NAMES[@]}" | grep -q "$GIT_SHA"; then + TAG_NAMES+=("$GIT_SHA") +fi +if ! echo "${TAG_NAMES[@]}" | grep -q "$BRANCH_NAME"; then + TAG_NAMES+=("$BRANCH_NAME") +fi +if ! 
echo "${TAG_NAMES[@]}" | grep -q "$VERSION"; then + TAG_NAMES+=("$VERSION") +fi + +echo "[+] Building + releasing Docker image for $SELECTED_PLATFORMS: branch=$BRANCH_NAME version=$VERSION tags=${TAG_NAMES[*]}" + +declare -a FULL_TAG_NAMES +# for each tag in TAG_NAMES, add archivebox/archivebox:tag and nikisweeting/archivebox:tag to FULL_TAG_NAMES +for TAG_NAME in "${TAG_NAMES[@]}"; do + [[ "$TAG_NAME" == "" ]] && continue + FULL_TAG_NAMES+=("-t archivebox/archivebox:$TAG_NAME") + FULL_TAG_NAMES+=("-t nikisweeting/archivebox:$TAG_NAME") + FULL_TAG_NAMES+=("-t ghcr.io/archivebox/archivebox:$TAG_NAME") +done +echo "${FULL_TAG_NAMES[@]}" + + +./bin/lock_pkgs.sh + +# echo "[*] Logging in to Docker Hub & Github Container Registry" # docker login --username=nikisweeting -# docker login docker.pkg.github.com --username=pirate -docker push archivebox/archivebox:$VERSION archivebox/archivebox:$SHORT_VERSION archivebox/archivebox:latest -docker push docker.io/nikisweeting/archivebox -docker push docker.io/archivebox/archivebox -docker push docker.pkg.github.com/archivebox/archivebox/archivebox +# docker login ghcr.io --username=pirate + +echo "[^] Uploading docker image" +mkdir -p "$HOME/.cache/docker/archivebox" + +# https://docs.docker.com/build/cache/backends/ +# shellcheck disable=SC2068 +exec docker buildx build \ + --platform "$SELECTED_PLATFORMS" \ + --cache-from type=local,src="$HOME/.cache/docker/archivebox" \ + --cache-to type=local,compression=zstd,mode=min,oci-mediatypes=true,dest="$HOME/.cache/docker/archivebox" \ + --push . ${FULL_TAG_NAMES[@]} diff --git a/bin/release_docs.sh b/bin/release_docs.sh index f6f5782395..617312429a 100755 --- a/bin/release_docs.sh +++ b/bin/release_docs.sh @@ -11,7 +11,7 @@ set -o pipefail IFS=$'\n' REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. 
&& pwd )" -VERSION="$(jq -r '.version' < "$REPO_DIR/package.json")" +VERSION="$(grep '^version = ' "${REPO_DIR}/pyproject.toml" | awk -F'"' '{print $2}')" cd "$REPO_DIR" @@ -21,5 +21,5 @@ git add . git commit -am "$VERSION release" git push git tag -a "v$VERSION" -m "v$VERSION" -git push origin master +git push origin git push origin --tags diff --git a/bin/release_git.sh b/bin/release_git.sh index 4a999e343a..bf53542a10 100755 --- a/bin/release_git.sh +++ b/bin/release_git.sh @@ -11,15 +11,13 @@ set -o pipefail IFS=$'\n' REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )" -VERSION="$(jq -r '.version' < "$REPO_DIR/package.json")" +VERSION="$(grep '^version = ' "${REPO_DIR}/pyproject.toml" | awk -F'"' '{print $2}')" cd "$REPO_DIR" # Push build to github echo "[^] Pushing release commit + tag to Github" -git commit -am "$VERSION release" -git tag -a "v$VERSION" -m "v$VERSION" -git push origin master -git push origin --tags +git tag -f -a "v$VERSION" -m "v$VERSION" +git push origin -f --tags echo " To finish publishing the release go here:" echo " https://github.com/ArchiveBox/ArchiveBox/releases/new" diff --git a/bin/release_pip.sh b/bin/release_pip.sh index a6b605bbd2..8831152218 100755 --- a/bin/release_pip.sh +++ b/bin/release_pip.sh @@ -11,17 +11,10 @@ set -o pipefail IFS=$'\n' REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )" -VERSION="$(jq -r '.version' < "$REPO_DIR/package.json")" cd "$REPO_DIR" source "$REPO_DIR/.venv/bin/activate" - -# apt install python3 python3-all python3-dev -# pip install '.[dev]' - - -echo "[^] Uploading to test.pypi.org" -python3 -m twine upload --repository testpypi pip_dist/archivebox-${VERSION}*.{whl,tar.gz} - -echo "[^] Uploading to pypi.org" -python3 -m twine upload --repository pypi pip_dist/archivebox-${VERSION}*.{whl,tar.gz} +echo "[^] Publishing to PyPI..." 
+rm -Rf dist +uv build +uv publish diff --git a/bin/setup.sh b/bin/setup.sh index 304c96c55c..1d540e6711 100755 --- a/bin/setup.sh +++ b/bin/setup.sh @@ -1,120 +1,218 @@ #!/usr/bin/env bash -# ArchiveBox Setup Script -# https://github.com/ArchiveBox/ArchiveBox +# ArchiveBox Setup Script (Ubuntu/Debian/FreeBSD/macOS) +# - Project Homepage: https://github.com/ArchiveBox/ArchiveBox +# - Install Documentation: https://github.com/ArchiveBox/ArchiveBox/wiki/Install +# Script Usage: +# curl -fsSL 'https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/dev/bin/setup.sh' | bash +# (aka https://docker-compose.archivebox.io) + +### Bash Environment Setup +# http://redsymbol.net/articles/unofficial-bash-strict-mode/ +# https://www.gnu.org/software/bash/manual/html_node/The-Set-Builtin.html +# set -o xtrace +# set -x +# shopt -s nullglob +set -o errexit +set -o errtrace +set -o nounset +set -o pipefail +# IFS=$'\n' + +clear + +if [ $(id -u) -eq 0 ]; then + echo + echo "[X] You cannot run this script as root. You must run it as a non-root user with sudo ability." + echo " Create a new non-privileged user 'archivebox' if necessary." + echo " adduser archivebox && usermod -a archivebox -G sudo && su archivebox" + echo " https://www.digitalocean.com/community/tutorials/how-to-create-a-new-sudo-enabled-user-on-ubuntu-20-04-quickstart" + echo " https://www.vultr.com/docs/create-a-sudo-user-on-freebsd" + echo " Then re-run this script as the non-root user." + echo + exit 2 +fi + +if (which docker > /dev/null && docker pull archivebox/archivebox:latest); then + echo "[+] Initializing an ArchiveBox data folder at ~/archivebox/data using Docker Compose..." 
+ mkdir -p ~/archivebox/data || exit 1 + cd ~/archivebox + if [ -f "./index.sqlite3" ]; then + mv -i ~/archivebox/* ~/archivebox/data/ + fi + curl -fsSL 'https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/stable/docker-compose.yml' > docker-compose.yml + mkdir -p ./etc + curl -fsSL 'https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/stable/etc/sonic.cfg' > ./etc/sonic.cfg + docker compose run --rm archivebox init --setup + echo + echo "[+] Starting ArchiveBox server using: docker compose up -d..." + docker compose up -d + sleep 7 + which open > /dev/null && open "http://127.0.0.1:8000" || true + echo + echo "[√] Server started on http://0.0.0.0:8000 and data directory initialized in ~/archivebox/data. Usage:" + echo " cd ~/archivebox" + echo " docker compose ps" + echo " docker compose down" + echo " docker compose pull" + echo " docker compose up" + echo " docker compose run archivebox manage createsuperuser" + echo " docker compose run archivebox add 'https://example.com'" + echo " docker compose run archivebox list" + echo " docker compose run archivebox help" + exit 0 +elif (which docker > /dev/null && docker pull archivebox/archivebox:latest); then + echo "[+] Initializing an ArchiveBox data folder at ~/archivebox/data using Docker..." + mkdir -p ~/archivebox/data || exit 1 + cd ~/archivebox + if [ -f "./index.sqlite3" ]; then + mv -i ~/archivebox/* ~/archivebox/data/ + fi + cd ./data + docker run -v "$PWD":/data -it --rm archivebox/archivebox:latest init --setup + echo + echo "[+] Starting ArchiveBox server using: docker run -d archivebox/archivebox..." + docker run -v "$PWD":/data -it -d -p 8000:8000 --name=archivebox archivebox/archivebox:latest + sleep 7 + which open > /dev/null && open "http://127.0.0.1:8000" || true + echo + echo "[√] Server started on http://0.0.0.0:8000 and data directory initialized in ~/archivebox/data. 
Usage:" + echo " cd ~/archivebox/data" + echo " docker ps --filter name=archivebox" + echo " docker kill archivebox" + echo " docker pull archivebox/archivebox" + echo " docker run -v $PWD:/data -d -p 8000:8000 --name=archivebox archivebox/archivebox" + echo " docker run -v $PWD:/data -it archivebox/archivebox manage createsuperuser" + echo " docker run -v $PWD:/data -it archivebox/archivebox add 'https://example.com'" + echo " docker run -v $PWD:/data -it archivebox/archivebox list" + echo " docker run -v $PWD:/data -it archivebox/archivebox help" + exit 0 +fi + +echo +echo "[!] It's highly recommended to use ArchiveBox with Docker, but Docker wasn't found." +echo +echo " âš ī¸ If you want to use Docker, press [Ctrl-C] to cancel now. âš ī¸" +echo " Get Docker: https://docs.docker.com/get-docker/" +echo " After you've installed Docker, run this script again." +echo +echo "Otherwise, install will continue with apt/brew/pkg + pip in 12s... (press [Ctrl+C] to cancel)" +echo +sleep 12 || exit 1 +echo "Proceeding with system package manager..." +echo echo "[i] ArchiveBox Setup Script đŸ“Ļ" -echo "" -echo " This is a helper script which installs the ArchiveBox dependencies on your system using homebrew/aptitude." -echo " You may be prompted for a password in order to install the following:" -echo "" -echo " - python3, python3-pip, python3-distutils" -echo " - curl" -echo " - wget" -echo " - git" -echo " - youtube-dl" -echo " - chromium-browser (skip this if Chrome/Chromium is already installed)" -echo " - nodejs (used for singlefile, readability, mercury, and more)" -echo "" -echo " If you'd rather install these manually, you can find documentation here:" +echo +echo " This is a helper script which installs the ArchiveBox dependencies on your system using brew/apt/pip3." 
+echo " You may be prompted for a sudo password in order to install the following:" +echo +echo " - archivebox" +echo " - python3, pip, nodejs, npm (languages used by ArchiveBox, and its extractor modules)" +echo " - curl, wget, git, youtube-dl, yt-dlp (used for extracting title, favicon, git, media, and more)" +echo " - chromium (skips this if any Chrome/Chromium version is already installed)" +echo +echo " If you'd rather install these manually as-needed, you can find detailed documentation here:" echo " https://github.com/ArchiveBox/ArchiveBox/wiki/Install" -echo "" -read -p "Press [enter] to continue with the automatic install, or Ctrl+C to cancel..." REPLY -echo "" +echo +echo "Continuing in 12s... (press [Ctrl+C] to cancel)" +echo +sleep 12 || exit 1 +echo "Proceeding to install dependencies..." +echo # On Linux: if which apt-get > /dev/null; then - echo "[+] Adding ArchiveBox apt repo to sources..." - sudo apt install software-properties-common - sudo add-apt-repository -u ppa:archivebox/archivebox - echo "[+] Installing python3, wget, curl..." - sudo apt install -y git python3 python3-pip python3-distutils wget curl youtube-dl nodejs npm ripgrep - # sudo apt install archivebox + echo "[+] Installing ArchiveBox system dependencies using apt..." + sudo apt-get update -qq + sudo apt-get install -y git python3 python3-pip python3-venv wget curl yt-dlp ffmpeg git nodejs npm ripgrep + sudo apt-get install -y libgtk2.0-0 libgtk-3-0 libnotify-dev libnss3 libxss1 libasound2 libxtst6 xauth xvfb libgbm-dev || sudo apt-get install -y chromium || sudo apt-get install -y chromium-browser || true + echo + echo "[+] Installing ArchiveBox python dependencies using pip3..." + sudo python3 -m pip install --upgrade --ignore-installed archivebox yt-dlp +# On Mac: +elif which brew > /dev/null; then + echo "[+] Installing ArchiveBox using Homebrew..." 
+ brew tap archivebox/archivebox + brew update + brew install archivebox +elif which pkg > /dev/null; then + echo "[+] Installing ArchiveBox system dependencies using pkg and pip (python3.9)..." + sudo pkg install -y python3 py39-pip py39-sqlite3 npm wget curl youtube_dl ffmpeg git ripgrep + sudo pkg install -y chromium + echo + echo "[+] Installing ArchiveBox python dependencies using pip..." + # don't use sudo here so that pip installs in $HOME/.local instead of into /usr/local + python3 -m pip install --upgrade --ignore-installed archivebox yt-dlp playwright +else + echo "[!] Warning: Could not find aptitude/homebrew/pkg! May not be able to install all dependencies automatically." + echo + echo " If you're on macOS, make sure you have homebrew installed: https://brew.sh/" + echo " If you're on Linux, only Ubuntu/Debian/BSD systems are officially supported with this script." + echo " If you're on Windows, this script is not officially supported (Docker is recommended instead)." + echo + echo "See the README.md for Manual Setup & Troubleshooting instructions if you you're unable to run ArchiveBox after this script completes." +fi - if which google-chrome; then - echo "[i] You already have google-chrome installed, if you would like to download chromium instead (they work pretty much the same), follow the Manual Setup instructions" - google-chrome --version - elif which chromium-browser; then - echo "[i] chromium-browser already installed, using existing installation." - chromium-browser --version - elif which chromium; then - echo "[i] chromium already installed, using existing installation." - chromium --version - else - echo "[+] Installing chromium..." - sudo apt install chromium || sudo apt install chromium-browser - fi +echo -# On Mac: -elif which brew > /dev/null; then # 🐍 eye of newt - echo "[+] Installing python3, wget, curl (ignore 'already installed' warnings)..." 
- brew install git wget curl youtube-dl ripgrep node - if which python3; then - if python3 -c 'import sys; raise SystemExit(sys.version_info < (3,5,0))'; then - echo "[√] Using existing $(which python3)..." - else - echo "[+] Installing python3..." - brew install python3 - fi - else - echo "[+] Installing python3..." - brew install python3 +if ! which archivebox > /dev/null 2>&1; then + # If archivebox isn't in PATH (e.g. pip install), check python modules directly + if ! (python3 --version && python3 -m pip --version && python3 -m django --version) 2>/dev/null; then + echo "[X] Python 3 pip was not found on your system!" + echo " You must first install Python >= 3.7 (and pip3):" + echo " https://www.python.org/downloads/" + echo " https://wiki.python.org/moin/BeginnersGuide/Download" + echo " After installing, run this script again." + exit 1 fi - if ls /Applications/Google\ Chrome*.app > /dev/null; then - echo "[√] Using existing /Applications/Google Chrome.app" - elif ls /Applications/Chromium.app; then - echo "[√] Using existing /Applications/Chromium.app" - elif which chromium-browser; then - echo "[√] Using existing $(which chromium-browser)" - elif which chromium; then - echo "[√] Using existing $(which chromium)" - else - echo "[+] Installing chromium..." - brew cask install chromium + if ! (python3 -m django --version && python3 -m pip show archivebox) 2>/dev/null; then + echo "[X] Django and ArchiveBox were not found after installing!" + echo " Check to see if a previous step failed." + echo + exit 1 fi -else - echo "[X] Could not find aptitude or homebrew! â€ŧī¸" - echo "" - echo " If you're on macOS, make sure you have homebrew installed: https://brew.sh/" - echo " If you're on Ubuntu/Debian, make sure you have apt installed: https://help.ubuntu.com/lts/serverguide/apt.html" - echo " (those are the only currently supported systems for the automatic setup script)" - echo "" - echo "See the README.md for Manual Setup & Troubleshooting instructions." 
- exit 1 fi -npm i -g npm -pip3 install --upgrade pip setuptools +if ! which archivebox > /dev/null 2>&1; then + echo "[X] archivebox command was not found in PATH after installing!" + echo " Check to see if a previous step failed." + exit 1 +fi -pip3 install --upgrade archivebox -npm install -g 'git+https://github.com/ArchiveBox/ArchiveBox.git' +# echo +# echo "[+] Upgrading npm and pip..." +# sudo npm i -g npm || true +# sudo python3 -m pip install --upgrade pip setuptools || true -# Check: -echo "" -echo "[*] Checking installed versions:" -echo "---------------------------------------------------" -which python3 && -python3 --version | head -n 1 && -echo "" && -which git && -git --version | head -n 1 && -echo "" && -which wget && -wget --version | head -n 1 && -echo "" && -which curl && -curl --version | head -n 1 && -echo "" && -which youtube-dl && -youtube-dl --version | head -n 1 && -echo "---------------------------------------------------" && -archivebox version && -echo "[√] All dependencies installed. ✅" && -exit 0 +echo +echo "[+] Installing Chromium binary using playwright..." +python3 -m playwright install --with-deps chromium || true +echo -echo "---------------------------------------------------" -echo "[X] Failed to install some dependencies! â€ŧī¸" -echo " - Try the Manual Setup instructions in the README.md" -echo " - Try the Troubleshooting: Dependencies instructions in the README.md" -echo " - Open an issue on github to get help: https://github.com/ArchiveBox/ArchiveBox/issues" -exit 1 +echo +echo "[+] Initializing ArchiveBox data folder at ~/archivebox/data..." +mkdir -p ~/archivebox/data || exit 1 +cd ~/archivebox +if [ -f "./index.sqlite3" ]; then + mv -i ~/archivebox/* ~/archivebox/data/ +fi +cd ./data +: | python3 -m archivebox init --setup || true # pipe in empty command to make sure stdin is closed +# init shows version output at the end too +echo +echo "[+] Starting ArchiveBox server using: nohup archivebox server &..." 
+nohup python3 -m archivebox server 0.0.0.0:8000 > ./logs/server.log 2>&1 & +sleep 7 +which open > /dev/null && open "http://127.0.0.1:8000" || true +echo +echo "[√] Server started on http://0.0.0.0:8000 and data directory initialized in ~/archivebox/data. Usage:" +echo " cd ~/archivebox/data # see your data dir" +echo " archivebox server --quick-init 0.0.0.0:8000 # start server process" +echo " archivebox manage createsuperuser # add an admin user+pass" +echo " ps aux | grep archivebox # see server process pid" +echo " pkill -f archivebox # stop the server" +echo " pip install --upgrade archivebox; archivebox init # update versions" +echo " archivebox add 'https://example.com'" # archive a new URL +echo " archivebox list # see URLs archived" +echo " archivebox help # see more help & examples" diff --git a/bin/setup_monorepo.sh b/bin/setup_monorepo.sh new file mode 100755 index 0000000000..535b123710 --- /dev/null +++ b/bin/setup_monorepo.sh @@ -0,0 +1,198 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)" +SCRIPT_REPO_ROOT="$(cd -- "$SCRIPT_DIR/.." 
&& pwd)" +GITHUB_BASE="${GITHUB_BASE:-https://github.com/ArchiveBox}" +MONOREPO_REMOTE="${MONOREPO_REMOTE:-$GITHUB_BASE/monorepo.git}" +REPO_NAMES=(abxbus abx-pkg abx-plugins abx-dl archivebox) + +is_member_repo() { + local repo_root="$1" + local repo_name + + for repo_name in "${REPO_NAMES[@]}"; do + if [[ "$(basename "$repo_root")" == "$repo_name" ]]; then + return 0 + fi + done + + return 1 +} + +monorepo_remote_matches() { + case "$1" in + git@github.com:ArchiveBox/monorepo.git | \ + git+ssh://git@github.com/ArchiveBox/monorepo.git | \ + https://github.com/ArchiveBox/monorepo.git) + return 0 + ;; + *) + return 1 + ;; + esac +} + +warn() { + printf 'Warning: %s\n' "$1" >&2 +} + +have_ldap_build_deps() { + if command -v dpkg-query >/dev/null 2>&1; then + dpkg-query -W -f='${Status}' libldap2-dev 2>/dev/null | grep -q 'install ok installed' && return 0 + fi + + if command -v brew >/dev/null 2>&1; then + brew --prefix openldap >/dev/null 2>&1 && return 0 + fi + + return 1 +} + +ensure_ldap_build_deps() { + if have_ldap_build_deps; then + return + fi + + printf 'Ensuring LDAP build dependencies (best effort)\n' + + if command -v apt >/dev/null 2>&1 && sudo -n apt install -y libldap2-dev >/dev/null 2>&1; then + return + fi + + if command -v brew >/dev/null 2>&1 && brew install openldap >/dev/null 2>&1; then + return + fi + + warn "Could not auto-install LDAP build dependencies; continuing. 
If you need archivebox[ldap], run: sudo apt install libldap2-dev || brew install openldap" +} + +sync_workspace() { + if uv sync --all-packages --all-extras --no-cache --active; then + return + fi + + warn "'uv sync --all-packages --all-extras --no-cache --active' failed; retrying without --all-extras" + uv sync --all-packages --no-cache --active +} + +ensure_setup_link() { + local repo_name="$1" + local repo_dir="$ROOT_DIR/$repo_name" + local link_path="$repo_dir/bin/setup_monorepo.sh" + local source_path="$ROOT_DIR/bin/setup.sh" + + mkdir -p "$repo_dir/bin" + + if [[ -e "$link_path" ]] && [[ "$source_path" -ef "$link_path" ]]; then + return + fi + + if [[ -d "$link_path" && ! -L "$link_path" ]]; then + printf 'Refusing to replace directory: %s\n' "$link_path" >&2 + exit 1 + fi + + rm -f "$link_path" + ln "$source_path" "$link_path" +} + +bootstrap_monorepo_root() { + local monorepo_root="$1" + local origin_url="" + + if [[ -d "$monorepo_root/.git" ]]; then + origin_url="$(git -C "$monorepo_root" remote get-url origin 2>/dev/null || true)" + + if [[ -n "$origin_url" ]] && ! 
monorepo_remote_matches "$origin_url"; then + printf 'Refusing to reuse existing git repo at %s (origin: %s)\n' "$monorepo_root" "$origin_url" >&2 + exit 1 + fi + + if [[ -z "$origin_url" ]]; then + git -C "$monorepo_root" remote add origin "$MONOREPO_REMOTE" + fi + + printf 'Updating monorepo root: %s\n' "$monorepo_root" + if git -C "$monorepo_root" -c pull.rebase=false pull --ff-only --quiet >/dev/null 2>&1; then + printf 'Updated monorepo root\n' + else + printf 'Skipping monorepo pull (local changes, divergent branch, detached HEAD, or no upstream)\n' >&2 + fi + return + fi + + printf 'Bootstrapping monorepo root in %s\n' "$monorepo_root" + git -C "$monorepo_root" init -b main >/dev/null + git -C "$monorepo_root" remote add origin "$MONOREPO_REMOTE" + git -C "$monorepo_root" fetch --depth=1 origin main --quiet + + if git -C "$monorepo_root" checkout -B main --track origin/main >/dev/null 2>&1; then + printf 'Initialized monorepo root\n' + else + printf 'Failed to materialize monorepo root in %s; existing files likely conflict with tracked monorepo files\n' "$monorepo_root" >&2 + exit 1 + fi +} + +if is_member_repo "$SCRIPT_REPO_ROOT"; then + ROOT_DIR="$(cd -- "$SCRIPT_REPO_ROOT/.." 
&& pwd)" + bootstrap_monorepo_root "$ROOT_DIR" +elif [[ -f "$SCRIPT_REPO_ROOT/pyproject.toml" ]]; then + ROOT_DIR="$SCRIPT_REPO_ROOT" +else + printf 'Unable to infer monorepo root from script location: %s\n' "$SCRIPT_DIR" >&2 + exit 1 +fi + +ensure_member_repo() { + local repo_name="$1" + local repo_dir="$ROOT_DIR/$repo_name" + + if [[ -d "$repo_dir/.git" ]]; then + printf 'Updating existing checkout: %s\n' "$repo_name" + if git -C "$repo_dir" -c pull.rebase=false pull --ff-only --quiet >/dev/null 2>&1; then + printf 'Updated: %s\n' "$repo_name" + else + printf 'Skipping pull for %s (local changes, divergent branch, detached HEAD, or no upstream)\n' "$repo_name" >&2 + fi + return + fi + + if [[ -e "$repo_dir" ]]; then + printf 'Refusing to overwrite existing path: %s\n' "$repo_dir" >&2 + exit 1 + fi + + printf 'Cloning %s/%s.git -> %s\n' "$GITHUB_BASE" "$repo_name" "$repo_name" + git clone "$GITHUB_BASE/$repo_name.git" "$repo_dir" +} + +for repo_name in "${REPO_NAMES[@]}"; do + ensure_member_repo "$repo_name" +done + +for repo_name in "${REPO_NAMES[@]}"; do + ensure_setup_link "$repo_name" +done + +cd "$ROOT_DIR" +deactivate || true +rm -Rf ./*/.venv # delete all sub-repo venvs, the monorepo venv needs to take precedence + +uv venv --allow-existing "$ROOT_DIR/.venv" +# shellcheck disable=SC1091 +source "$ROOT_DIR/.venv/bin/activate" +ensure_ldap_build_deps +sync_workspace +echo +echo +echo "[√] Monorepo setup complete, cloned and pulled: ${REPO_NAMES[*]}" +echo " MONOREPO_ROOT=$ROOT_DIR" +echo " VIRTUAL_ENV=$VIRTUAL_ENV" +echo " PYTHON_BIN=$VIRTUAL_ENV/bin/python" +echo " NODE_BIN=$(which node)" +echo +echo "TIPS:" +echo " - Always use 'uv run ...' within each subrepo, never in the root & never run 'python ...' 
directly" +echo " - Always read $ROOT_DIR/README.md into context before starting any work" diff --git a/bin/test.sh b/bin/test.sh index f9ea35750b..7567a56cfb 100755 --- a/bin/test.sh +++ b/bin/test.sh @@ -14,4 +14,5 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )" source "$DIR/.venv/bin/activate" -pytest -s --basetemp=tests/out --ignore=archivebox/vendor --ignore=deb_dist --ignore=pip_dist --ignore=brew_dist +pytest -s --basetemp=archivebox/tests/data "$@" +exec ./bin/test_plugins.sh diff --git a/bin/test_plugins.sh b/bin/test_plugins.sh new file mode 100755 index 0000000000..2cef86e005 --- /dev/null +++ b/bin/test_plugins.sh @@ -0,0 +1,382 @@ +#!/bin/bash +# Run ArchiveBox plugin tests with coverage +# +# All plugin tests use pytest and are located in pluginname/tests/test_*.py +# +# Usage: ./bin/test_plugins.sh [plugin_name] [--no-coverage] [--coverage-report] +# +# Examples: +# ./bin/test_plugins.sh # Run all plugin tests with coverage +# ./bin/test_plugins.sh chrome # Run chrome plugin tests with coverage +# ./bin/test_plugins.sh parse_* # Run all parse_* plugin tests with coverage +# ./bin/test_plugins.sh --no-coverage # Run all tests without coverage +# ./bin/test_plugins.sh --coverage-report # Just show coverage report without running tests +# +# For running individual hooks with coverage: +# NODE_V8_COVERAGE=./coverage/js node .js [args] # JS hooks +# coverage run --parallel-mode .py [args] # Python hooks +# +# Coverage results are saved to .coverage (Python) and coverage/js (JavaScript): +# coverage combine && coverage report +# coverage json +# ./bin/test_plugins.sh --coverage-report + +set -euo pipefail + +# Color codes +GREEN='\033[0;32m' +RED='\033[0;31m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +# Save root directory first +ROOT_DIR="$(cd "$(dirname "$0")/.." 
&& pwd)" +PLUGINS_DIR="${ABX_PLUGINS_DIR:-$(python3 -c 'from abx_plugins import get_plugins_dir; print(get_plugins_dir())')}" + +# Parse arguments +PLUGIN_FILTER="" +ENABLE_COVERAGE=true +COVERAGE_REPORT_ONLY=false + +for arg in "$@"; do + if [ "$arg" = "--no-coverage" ]; then + ENABLE_COVERAGE=false + elif [ "$arg" = "--coverage-report" ]; then + COVERAGE_REPORT_ONLY=true + else + PLUGIN_FILTER="$arg" + fi +done + +# Read secret-like config properties from a plugin's standardized config.json. +# Each output line is a pipe-delimited env alias group where any populated alias +# satisfies the requirement, e.g. TWOCAPTCHA_API_KEY|API_KEY_2CAPTCHA. +get_plugin_secret_groups() { + local plugin_dir="$1" + local config_json="$plugin_dir/config.json" + + if [ ! -f "$config_json" ]; then + return 0 + fi + + python3 - "$config_json" <<'PY' +import json +import re +import sys +from pathlib import Path + +config_path = Path(sys.argv[1]) +try: + config = json.loads(config_path.read_text()) +except Exception: + sys.exit(0) + +properties = config.get("properties", {}) +for env_name, schema in properties.items(): + default = schema.get("default") + aliases = [alias for alias in schema.get("x-aliases", []) if alias] + looks_secret = bool(schema.get("x-sensitive")) or bool(re.search(r"(API_KEY|TOKEN|SECRET)", env_name)) + if schema.get("type") == "string" and looks_secret and default in ("", None): + print("|".join([env_name, *aliases])) +PY +} + +# Function to show JS coverage report (inlined from convert_v8_coverage.js) +show_js_coverage() { + local plugin_root="$1" + local coverage_dir="$2" + + if [ ! 
-d "$coverage_dir" ] || [ -z "$(ls -A "$coverage_dir" 2>/dev/null)" ]; then + echo "No JavaScript coverage data collected" + echo "(JS hooks may not have been executed during tests)" + return + fi + + node - "$plugin_root" "$coverage_dir" << 'ENDJS' +const fs = require('fs'); +const path = require('path'); +const pluginRoot = path.resolve(process.argv[2]).replace(/\\/g, '/'); +const coverageDir = process.argv[3]; + +const files = fs.readdirSync(coverageDir).filter(f => f.startsWith('coverage-') && f.endsWith('.json')); +if (files.length === 0) { + console.log('No coverage files found'); + process.exit(0); +} + +const coverageByFile = {}; + +files.forEach(file => { + const data = JSON.parse(fs.readFileSync(path.join(coverageDir, file), 'utf8')); + data.result.forEach(script => { + const url = script.url; + if (url.startsWith('node:') || url.includes('node_modules')) return; + + if (!coverageByFile[url]) { + coverageByFile[url] = { totalRanges: 0, executedRanges: 0 }; + } + + script.functions.forEach(func => { + func.ranges.forEach(range => { + coverageByFile[url].totalRanges++; + if (range.count > 0) coverageByFile[url].executedRanges++; + }); + }); + }); +}); + +const allFiles = Object.keys(coverageByFile).sort(); +const pluginFiles = allFiles.filter(url => url.replace(/\\/g, '/').includes(pluginRoot)); +const otherFiles = allFiles.filter(url => !url.startsWith('node:') && !url.replace(/\\/g, '/').includes(pluginRoot)); + +console.log('Total files with coverage: ' + allFiles.length + '\n'); +console.log('Plugin files: ' + pluginFiles.length); +console.log('Node internal: ' + allFiles.filter(u => u.startsWith('node:')).length); +console.log('Other: ' + otherFiles.length + '\n'); + +console.log('JavaScript Coverage Report'); +console.log('='.repeat(80)); +console.log(''); + +if (otherFiles.length > 0) { + console.log('Non-plugin files with coverage:'); + otherFiles.forEach(url => console.log(' ' + url)); + console.log(''); +} + +if (pluginFiles.length === 0) { + 
console.log('No plugin files covered'); + process.exit(0); +} + +let totalRanges = 0, totalExecuted = 0; + +pluginFiles.forEach(url => { + const cov = coverageByFile[url]; + const pct = cov.totalRanges > 0 ? (cov.executedRanges / cov.totalRanges * 100).toFixed(1) : '0.0'; + const normalizedUrl = url.replace(/\\/g, '/'); + const displayPath = normalizedUrl.includes(pluginRoot) ? normalizedUrl.slice(normalizedUrl.indexOf(pluginRoot)) : url; + console.log(displayPath + ': ' + pct + '% (' + cov.executedRanges + '/' + cov.totalRanges + ' ranges)'); + totalRanges += cov.totalRanges; + totalExecuted += cov.executedRanges; +}); + +console.log(''); +console.log('-'.repeat(80)); +const overallPct = totalRanges > 0 ? (totalExecuted / totalRanges * 100).toFixed(1) : '0.0'; +console.log('Total: ' + overallPct + '% (' + totalExecuted + '/' + totalRanges + ' ranges)'); +ENDJS +} + +# If --coverage-report only, just show the report and exit +if [ "$COVERAGE_REPORT_ONLY" = true ]; then + cd "$ROOT_DIR" || exit 1 + echo "==========================================" + echo "Python Coverage Summary" + echo "==========================================" + coverage combine 2>/dev/null || true + coverage report --include="*/abx_plugins/plugins/*" --omit="*/tests/*" + echo "" + + echo "==========================================" + echo "JavaScript Coverage Summary" + echo "==========================================" + show_js_coverage "$PLUGINS_DIR" "$ROOT_DIR/coverage/js" + echo "" + + echo "For detailed coverage reports:" + echo " Python: coverage report --show-missing --include='*/abx_plugins/plugins/*' --omit='*/tests/*'" + echo " Python: coverage json # LLM-friendly format" + echo " Python: coverage html # Interactive HTML report" + exit 0 +fi + +# Set DATA_DIR for tests (required by abx_pkg and plugins) +# Use temp dir to isolate tests from project files +if [ -z "${DATA_DIR:-}" ]; then + export DATA_DIR=$(mktemp -d -t archivebox_plugin_tests.XXXXXX) + # Clean up on exit + trap "rm -rf 
'$DATA_DIR'" EXIT +fi + +# Reset coverage data if collecting coverage +if [ "$ENABLE_COVERAGE" = true ]; then + echo "Resetting coverage data..." + cd "$ROOT_DIR" || exit 1 + coverage erase + rm -rf "$ROOT_DIR/coverage/js" 2>/dev/null + mkdir -p "$ROOT_DIR/coverage/js" + + # Enable Python subprocess coverage + export COVERAGE_PROCESS_START="$ROOT_DIR/pyproject.toml" + export PYTHONPATH="$ROOT_DIR${PYTHONPATH:+:$PYTHONPATH}" # For sitecustomize.py + + # Enable Node.js V8 coverage (built-in, no packages needed) + export NODE_V8_COVERAGE="$ROOT_DIR/coverage/js" + + echo "Python coverage: enabled (subprocess support)" + echo "JavaScript coverage: enabled (NODE_V8_COVERAGE=$NODE_V8_COVERAGE)" + echo "" +fi + +cd "$ROOT_DIR" || exit 1 + +echo "==========================================" +echo "ArchiveBox Plugin Tests" +echo "==========================================" +echo "" + +if [ -n "$PLUGIN_FILTER" ]; then + echo "Filter: $PLUGIN_FILTER" +else + echo "Running all plugin tests" +fi + +if [ "$ENABLE_COVERAGE" = true ]; then + echo "Coverage: enabled" +else + echo "Coverage: disabled" +fi +echo "" + +# Track results +TOTAL_PLUGINS=0 +PASSED_PLUGINS=0 +FAILED_PLUGINS=0 +UNAVAILABLE_PLUGINS=0 + +# Find and run plugin tests +if [ -n "$PLUGIN_FILTER" ]; then + # Run tests for specific plugin(s) matching pattern + TEST_DIRS=$(find "$PLUGINS_DIR" -maxdepth 2 -type d -path "$PLUGINS_DIR/${PLUGIN_FILTER}*/tests" 2>/dev/null | sort) +else + # Run all plugin tests + TEST_DIRS=$(find "$PLUGINS_DIR" -maxdepth 2 -type d -name "tests" -path "$PLUGINS_DIR/*/tests" 2>/dev/null | sort) +fi + +if [ -z "$TEST_DIRS" ]; then + echo -e "${YELLOW}No plugin tests found${NC}" + [ -n "$PLUGIN_FILTER" ] && echo "Pattern: $PLUGIN_FILTER" + exit 0 +fi + +for test_dir in $TEST_DIRS; do + # Check if there are any Python test files + if ! 
compgen -G "${test_dir}/test_*.py" > /dev/null 2>&1; then + continue + fi + + plugin_name=$(basename "$(dirname "$test_dir")") + plugin_dir=$(dirname "$test_dir") + TOTAL_PLUGINS=$((TOTAL_PLUGINS + 1)) + + # New plugin packages can include live integration suites that require API + # credentials. Only run those suites when the standardized config.json + # secrets are actually available in the current environment. + missing_secret_groups=() + while IFS= read -r secret_group; do + [ -z "$secret_group" ] && continue + + secret_available=false + IFS='|' read -r -a secret_names <<< "$secret_group" + for secret_name in "${secret_names[@]}"; do + if [ -n "${!secret_name:-}" ]; then + secret_available=true + break + fi + done + + if [ "$secret_available" = false ]; then + missing_secret_groups+=("$secret_group") + fi + done < <(get_plugin_secret_groups "$plugin_dir") + + if [ ${#missing_secret_groups[@]} -gt 0 ]; then + echo -e "${YELLOW}[UNAVAILABLE]${NC} $plugin_name" + printf 'Missing secret env for full suite: %s\n' "${missing_secret_groups[*]}" + UNAVAILABLE_PLUGINS=$((UNAVAILABLE_PLUGINS + 1)) + echo "" + continue + fi + + echo -e "${YELLOW}[RUNNING]${NC} $plugin_name" + + # Build pytest command with optional coverage + PYTEST_CMD=(python -m pytest "$test_dir" -p no:django -v --tb=short) + if [ "$ENABLE_COVERAGE" = true ]; then + PYTEST_CMD+=(--cov="$(dirname "$test_dir")" --cov-append --cov-branch) + echo "[DEBUG] NODE_V8_COVERAGE before pytest: $NODE_V8_COVERAGE" + python -c "import os; print('[DEBUG BASH->PYTHON] NODE_V8_COVERAGE:', os.environ.get('NODE_V8_COVERAGE', 'NOT_SET'))" + fi + + LOG_FILE=$(mktemp -t "archivebox_plugin_${plugin_name}.XXXXXX.log") + PLUGIN_TMPDIR=$(mktemp -d -t "archivebox_plugin_${plugin_name}.XXXXXX") + if ( + cd "$PLUGIN_TMPDIR" + TMPDIR="$PLUGIN_TMPDIR" "${PYTEST_CMD[@]}" + ) >"$LOG_FILE" 2>&1; then + grep -v "^platform\|^cachedir\|^rootdir\|^configfile\|^plugins:" "$LOG_FILE" | tail -100 + echo -e "${GREEN}[PASSED]${NC} $plugin_name" 
+ PASSED_PLUGINS=$((PASSED_PLUGINS + 1)) + else + grep -v "^platform\|^cachedir\|^rootdir\|^configfile\|^plugins:" "$LOG_FILE" | tail -100 + echo -e "${RED}[FAILED]${NC} $plugin_name" + FAILED_PLUGINS=$((FAILED_PLUGINS + 1)) + fi + rm -f "$LOG_FILE" + rm -rf "$PLUGIN_TMPDIR" + echo "" +done + +# Print summary +echo "==========================================" +echo "Test Summary" +echo "==========================================" +echo -e "Total plugins tested: $TOTAL_PLUGINS" +echo -e "${GREEN}Passed:${NC} $PASSED_PLUGINS" +echo -e "${RED}Failed:${NC} $FAILED_PLUGINS" +echo -e "${YELLOW}Unavailable:${NC} $UNAVAILABLE_PLUGINS" +echo "" + +if [ $TOTAL_PLUGINS -eq 0 ]; then + echo -e "${YELLOW}⚠ No tests found${NC}" + exit 0 +elif [ $FAILED_PLUGINS -eq 0 ]; then + if [ $UNAVAILABLE_PLUGINS -eq 0 ]; then + echo -e "${GREEN}✓ All plugin tests passed!${NC}" + else + echo -e "${GREEN}✓ All runnable plugin tests passed!${NC}" + echo -e "${YELLOW}⚠ Some plugin suites were unavailable in this environment${NC}" + fi + + # Show coverage summary if enabled + if [ "$ENABLE_COVERAGE" = true ]; then + echo "" + echo "==========================================" + echo "Python Coverage Summary" + echo "==========================================" + # Coverage data is in ROOT_DIR, combine and report from there + cd "$ROOT_DIR" || exit 1 + # Copy coverage data from plugins dir if it exists + coverage combine 2>/dev/null || true + coverage report --include="*/abx_plugins/plugins/*" --omit="*/tests/*" 2>&1 | head -50 + echo "" + + echo "==========================================" + echo "JavaScript Coverage Summary" + echo "==========================================" + show_js_coverage "$PLUGINS_DIR" "$ROOT_DIR/coverage/js" + echo "" + + echo "For detailed coverage reports (from project root):" + echo " Python: coverage report --show-missing --include='*/abx_plugins/plugins/*' --omit='*/tests/*'" + echo " Python: coverage json # LLM-friendly format" + echo " Python: coverage html # 
Interactive HTML report" + echo " JavaScript: ./bin/test_plugins.sh --coverage-report" + fi + + exit 0 +else + echo -e "${RED}✗ Some plugin tests failed${NC}" + exit 1 +fi diff --git a/brew_dist b/brew_dist deleted file mode 160000 index 95a1c1a087..0000000000 --- a/brew_dist +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 95a1c1a0875841d076f06106bd4c2307504928c2 diff --git a/brew_dist/archivebox.rb b/brew_dist/archivebox.rb new file mode 100644 index 0000000000..3e4d2f3113 --- /dev/null +++ b/brew_dist/archivebox.rb @@ -0,0 +1,50 @@ +# Auto-generated by bin/build_brew.sh using homebrew-pypi-poet. +# Users install with: brew tap archivebox/archivebox && brew install archivebox + +class Archivebox < Formula + include Language::Python::Virtualenv + + desc "Self-hosted internet archiving solution" + homepage "https://github.com/ArchiveBox/ArchiveBox" + url "https://files.pythonhosted.org/packages/source/a/archivebox/archivebox-0.9.3.tar.gz" + sha256 "" # auto-filled by bin/build_brew.sh + license "MIT" + head "https://github.com/ArchiveBox/ArchiveBox.git", branch: "dev" + + depends_on "python@3.13" + # All other runtime deps (node, chrome, yt-dlp, etc.) are installed + # on-demand by `archivebox install` and should NOT be declared here. 
+ + # Python dependency resource blocks auto-generated by homebrew-pypi-poet + # AUTOGENERATED_RESOURCES_START + # AUTOGENERATED_RESOURCES_END + + def install + virtualenv_install_with_resources + end + + def post_install + # Initialize ArchiveBox data in the Homebrew-managed var directory + data_dir = var/"archivebox" + data_dir.mkpath + ENV["DATA_DIR"] = data_dir.to_s + system bin/"archivebox", "init" + end + + def caveats + <<~EOS + ArchiveBox data is stored in: + #{var}/archivebox + + To start archiving, run: + cd #{var}/archivebox && archivebox add 'https://example.com' + + To start the web UI: + cd #{var}/archivebox && archivebox server 0.0.0.0:8000 + EOS + end + + test do + assert_match version.to_s, shell_output("#{bin}/archivebox version") + end +end diff --git a/deb_dist b/deb_dist deleted file mode 160000 index f8e3a0247c..0000000000 --- a/deb_dist +++ /dev/null @@ -1 +0,0 @@ -Subproject commit f8e3a0247c09a2f9aaea2848ee7da9c486e14669 diff --git a/docker b/docker deleted file mode 160000 index 236f7881e3..0000000000 --- a/docker +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 236f7881e3105b218864d9b3185b17c44b306106 diff --git a/docker-compose.yml b/docker-compose.yml index 3b2959d549..f7066fb336 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,72 +1,133 @@ # Usage: -# docker-compose run archivebox init --setup -# docker-compose up -# echo "https://example.com" | docker-compose run archivebox archivebox add -# docker-compose run archivebox add --depth=1 https://example.com/some/feed.rss -# docker-compose run archivebox config --set PUBLIC_INDEX=True -# docker-compose run archivebox help +# mkdir -p ~/archivebox/data && cd ~/archivebox +# curl -fsSL 'https://docker-compose.archivebox.io' > docker-compose.yml +# docker compose run archivebox init --install +# docker compose run archivebox add --depth=1 'https://news.ycombinator.com' +# docker compose run -T archivebox add < bookmarks.txt +# docker compose up -d && open 
'http://web.archivebox.localhost:8000' +# docker compose run archivebox help # Documentation: # https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#docker-compose -version: '2.4' - services: archivebox: - # build: . # for developers working on archivebox - image: ${DOCKER_IMAGE:-archivebox/archivebox:master} - command: server --quick-init 0.0.0.0:8000 + image: archivebox/archivebox:latest ports: - 8000:8000 - environment: - - ALLOWED_HOSTS=* # add any config options you want as env vars - - MEDIA_MAX_SIZE=750m - # - SEARCH_BACKEND_ENGINE=sonic # uncomment these if you enable sonic below - # - SEARCH_BACKEND_HOST_NAME=sonic - # - SEARCH_BACKEND_PASSWORD=SecretPassword volumes: - ./data:/data - # - ./archivebox:/app/archivebox # for developers working on archivebox + # ./data/personas/Default/chrome_profile/Default:/data/personas/Default/chrome_profile/Default + environment: + # - ADMIN_USERNAME=admin # creates an admin user on first run with the given user/pass combo + # - ADMIN_PASSWORD=SomeSecretPassword + - LISTEN_HOST=archivebox.localhost:8000 + - ALLOWED_HOSTS=* # set this to the hostname(s) you're going to serve the site from! 
+ - CSRF_TRUSTED_ORIGINS=http://admin.archivebox.localhost:8000 # MUST match the admin UI URL for login/API to work + - PUBLIC_INDEX=True # set to False to prevent anonymous users from viewing snapshot list + - PUBLIC_SNAPSHOTS=True # set to False to prevent anonymous users from viewing snapshot content + - PUBLIC_ADD_VIEW=False # set to True to allow anonymous users to submit new URLs to archive + - SEARCH_BACKEND_ENGINE=sonic # tells ArchiveBox to use its built-in Sonic worker for fast full-text search + # - SEARCH_BACKEND_HOST_NAME=127.0.0.1 + - SEARCH_BACKEND_PASSWORD=SomeSecretPassword + # - PUID=911 # set to your host user's UID & GID if you encounter permissions issues + # - PGID=911 # UID/GIDs lower than 500 may clash with system uids and are not recommended + # For options below, it's better to set in data/ArchiveBox.conf or use `docker compose run archivebox config --set SOME_KEY=someval` instead of setting here: + # - TIMEOUT=60 # increase this number to 120+ seconds if you see many slow downloads timing out + # - CHECK_SSL_VALIDITY=True # set to False to disable strict SSL checking (allows saving URLs w/ broken certs) + # - USER_AGENT="..." # set a custom USER_AGENT to avoid being blocked as a bot + # ... + # For more info, see: https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#configuration + + # For ad-blocking during archiving, uncomment this section and the pihole service below + # networks: + # - dns + # dns: + # - 172.20.0.53 + + + ######## Optional Addons: tweak examples below as needed for your specific use case ######## + + ### `archivebox server` now runs the orchestrator itself, so scheduled crawls and queued UI/API jobs + # are processed by the main container without needing a separate scheduler sidecar. To add a new job: + # $ docker compose run archivebox schedule --add --every=day --depth=1 'https://example.com/some/rss/feed.xml' + # the running server orchestrator will pick it up automatically at the next due time. 
+ # https://github.com/ArchiveBox/ArchiveBox/wiki/Scheduled-Archiving + + + ### ArchiveBox now starts and uses Sonic automatically when SEARCH_BACKEND_ENGINE=sonic, + # so the old standalone docker sidecar below is no longer necessary. + # If Sonic is ever started after not running for a while, update its full-text index by running: + # $ docker compose run archivebox update --index-only + # https://github.com/ArchiveBox/ArchiveBox/wiki/Setting-up-Search - # To run the Sonic full-text search backend, first download the config file to sonic.cfg - # curl -O https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/etc/sonic.cfg - # after starting, backfill any existing Snapshots into the index: docker-compose run archivebox update --index-only # sonic: - # image: valeriansaliou/sonic:v1.3.0 - # expose: - # - 1491 - # environment: - # - SEARCH_BACKEND_PASSWORD=SecretPassword - # volumes: - # - ./sonic.cfg:/etc/sonic.cfg:ro - # - ./data/sonic:/var/lib/sonic/store - - - ### Optional Addons: tweak these examples as needed for your specific use case - - # Example: Run scheduled imports in a docker instead of using cron on the - # host machine, add tasks and see more info with archivebox schedule --help - # scheduler: - # image: archivebox/archivebox:latest - # command: schedule --foreground --every=day --depth=1 'https://getpocket.com/users/USERNAME/feed/all' - # environment: - # - USE_COLOR=True - # - SHOW_PROGRESS=False - # volumes: - # - ./data:/data - - # Example: Put Nginx in front of the ArchiveBox server for SSL termination + # image: archivebox/sonic:latest + # expose: + # - 1491 + # environment: + # - SEARCH_BACKEND_PASSWORD=SomeSecretPassword + # volumes: + # #- ./sonic.cfg:/etc/sonic.cfg:ro # mount to customize: https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/stable/etc/sonic.cfg + # - ./data/sonic:/var/lib/sonic/store + + + ### This optional container runs xvfb+noVNC so you can watch the ArchiveBox browser as it archives things, + # or remote 
control it to set up a chrome profile w/ login credentials for sites you want to archive. + # https://github.com/ArchiveBox/ArchiveBox/wiki/Chromium-Install#setting-up-a-chromium-user-profile + # https://github.com/ArchiveBox/ArchiveBox/wiki/Chromium-Install#docker-vnc-setup + + novnc: + image: theasp/novnc:latest + environment: + - DISPLAY_WIDTH=1920 + - DISPLAY_HEIGHT=1080 + - RUN_XTERM=no + ports: + # to view/control ArchiveBox's browser, visit: http://127.0.0.1:8080/vnc.html + # restricted to access from localhost by default because it has no authentication + - 127.0.0.1:8080:8080 + + + ### Example: Put Nginx in front of the ArchiveBox server for SSL termination and static file serving. + # You can also use any other ingress provider for SSL like Apache, Caddy, Traefik, Cloudflare Tunnels, etc. + # nginx: # image: nginx:alpine # ports: # - 443:443 # - 80:80 # volumes: - # - ./etc/nginx/nginx.conf:/etc/nginx/nginx.conf + # - ./etc/nginx.conf:/etc/nginx/nginx.conf # - ./data:/var/www - # Example: run all your ArchiveBox traffic through a WireGuard VPN tunnel + + ### Example: To run pihole in order to block ad/tracker requests during archiving, + # uncomment this optional block and set up pihole using its admin interface + + # pihole: + # image: pihole/pihole:latest + # ports: + # # access the admin HTTP interface on http://localhost:8090 + # - 127.0.0.1:8090:80 + # environment: + # - WEBPASSWORD=SET_THIS_TO_SOME_SECRET_PASSWORD_FOR_ADMIN_DASHBOARD + # - DNSMASQ_LISTENING=all + # dns: + # - 127.0.0.1 + # - 1.1.1.1 + # networks: + # dns: + # ipv4_address: 172.20.0.53 + # volumes: + # - ./etc/pihole:/etc/pihole + # - ./etc/dnsmasq:/etc/dnsmasq.d + + + ### Example: run all your ArchiveBox traffic through a WireGuard VPN tunnel to avoid IP blocks. + # You can also use any other VPN that works at the docker/IP level, e.g. Tailscale, OpenVPN, etc. 
+ # wireguard: - # image: linuxserver/wireguard + # image: linuxserver/wireguard:latest # network_mode: 'service:archivebox' # cap_add: # - NET_ADMIN @@ -78,14 +139,57 @@ services: # - /lib/modules:/lib/modules # - ./wireguard.conf:/config/wg0.conf:ro - # Example: Run PYWB in parallel and auto-import WARCs from ArchiveBox + ### Example: Run ChangeDetection.io to watch for changes to websites, then trigger ArchiveBox to archive them + # Documentation: https://github.com/dgtlmoon/changedetection.io + # More info: https://github.com/dgtlmoon/changedetection.io/blob/master/docker-compose.yml + + # changedetection: + # image: ghcr.io/dgtlmoon/changedetection.io + # volumes: + # - ./data-changedetection:/datastore + + + ### Example: Run PYWB in parallel and auto-import WARCs from ArchiveBox + # pywb: # image: webrecorder/pywb:latest - # entrypoint: /bin/sh 'wb-manager add default /archivebox/archive/*/warc/*.warc.gz; wayback --proxy;' + # entrypoint: /bin/sh -c '(wb-manager init default || test $$? -eq 2) && wb-manager add default /archivebox/archive/*/warc/*.warc.gz; wayback;' # environment: # - INIT_COLLECTION=archivebox # ports: - # - 8080:8080 + # - 8686:8080 # volumes: - # ./data:/archivebox - # ./data/wayback:/webarchive + # - ./data:/archivebox + # - ./data/wayback:/webarchive + + +networks: + # network just used for pihole container to offer :53 dns resolving on fixed ip for archivebox container + dns: + ipam: + driver: default + config: + - subnet: 172.20.0.0/24 + + +# HOW TO: Set up cloud storage for your ./data/archive (e.g. Amazon S3, Backblaze B2, Google Drive, OneDrive, SFTP, etc.) 
+# https://github.com/ArchiveBox/ArchiveBox/wiki/Setting-Up-Storage +# +# Follow the steps here to set up the Docker RClone Plugin https://rclone.org/docker/ +# $ docker plugin install rclone/docker-volume-rclone:amd64 --grant-all-permissions --alias rclone +# $ nano /var/lib/docker-plugins/rclone/config/rclone.conf +# [examplegdrive] +# type = drive +# scope = drive +# drive_id = 1234567... +# root_folder_id = 0Abcd... +# token = {"access_token":...} + +# volumes: +# archive: +# driver: rclone +# driver_opts: +# remote: 'examplegdrive:archivebox' +# allow_other: 'true' +# vfs_cache_mode: full +# poll_interval: 0 diff --git a/docs b/docs index bfc5f76a61..7244076ece 160000 --- a/docs +++ b/docs @@ -1 +1 @@ -Subproject commit bfc5f76a61faee5c0c04bae03fe2b88cff1c39c5 +Subproject commit 7244076ecec0264dddfba14930f5f8bfe4fb4ef0 diff --git a/etc/ArchiveBox.conf.default b/etc/ArchiveBox.conf.default index 982a193151..3a8f6c7245 100644 --- a/etc/ArchiveBox.conf.default +++ b/etc/ArchiveBox.conf.default @@ -1,4 +1,4 @@ -# This is the example default configiration file for ArchiveBox. +# This is the example default configuration file for ArchiveBox. # # Copy lines from here into your project's ArchiveBox.conf file and uncomment, # DO NOT EDIT THIS FILE DIRECTLY! 
@@ -11,7 +11,7 @@ # RESTRICT_FILE_NAMES = windows # ONLY_NEW = False # TIMEOUT = 60 -# MEDIA_TIMEOUT = 3600 +# YTDLP_TIMEOUT = 3600 # URL_BLACKLIST = (://(.*\.)?facebook\.com)|(://(.*\.)?ebay\.com)|(.*\.exe$) # CHECK_SSL_VALIDITY = True # RESOLUTION = 1440,900 @@ -36,7 +36,7 @@ # SAVE_SCREENSHOT = True # SAVE_DOM = True # SAVE_GIT = True -# SAVE_MEDIA = False +# SAVE_YTDLP = False # SAVE_ARCHIVE_DOT_ORG = True @@ -55,7 +55,7 @@ # CURL_BINARY = curl # GIT_BINARY = git # WGET_BINARY = wget -# YOUTUBEDL_BINARY = youtube-dl +# YOUTUBEDL_BINARY = yt-dlp # CHROME_BINARY = chromium # CHROME_USER_DATA_DIR="~/.config/google-chrome/Default" diff --git a/etc/README.md b/etc/README.md index 1b9d7aa21a..1b7f0865cd 100644 --- a/etc/README.md +++ b/etc/README.md @@ -2,7 +2,7 @@ In this folder are some example config files you can use for setting up ArchiveBox on your machine. -E.g. see `etc/nginx` for an example nginx config to serve your archive, or `etc/cron` for an example cron job that crawls a feed every 24 hours. +E.g. see `nginx.conf` for an example nginx config to serve your archive with SSL, or `fly.toml` for an example deployment to the Fly.io hosting platform. Please contribute your etc files here! Example contributions diff --git a/etc/archivebox.service b/etc/archivebox.service new file mode 100644 index 0000000000..cd42f8518f --- /dev/null +++ b/etc/archivebox.service @@ -0,0 +1,29 @@ +# This is an example systemd service config definition for ArchiveBox. 
+# +# Link it into place on your system to use systemd to auto-start the ArchiveBox server on boot: +# https://unix.stackexchange.com/questions/224992/where-do-i-put-my-systemd-unit-file +# +# Review and change these lines as-needed for your specific environment and needs: +# WorkingDirectory, ExecStart, User, Group + +[Unit] +Description=Open source self-hosted web archiving +Documentation=https://github.com/ArchiveBox/ArchiveBox/wiki + +[Service] +Type=simple +WorkingDirectory=/home/archivebox/archivebox/ +ExecStart=/usr/local/bin/archivebox server 0.0.0.0:8000 +ExecReload=/bin/kill -s HUP $MAINPID +ExecStop=/bin/kill -s QUIT $MAINPID +Restart=always +RestartSec=2 +StandardOutput=syslog +StandardError=syslog +SyslogIdentifier=archivebox +User=archivebox +Group=archivebox + + +[Install] +WantedBy=multi-user.target diff --git a/etc/cron.d/ArchiveBox b/etc/cron.d/ArchiveBox deleted file mode 100644 index aa878a9777..0000000000 --- a/etc/cron.d/ArchiveBox +++ /dev/null @@ -1 +0,0 @@ -0 24 * * * www-data /opt/ArchiveBox/bin/archive "https://getpocket.com/users/example/feed/all" >> /var/log/ArchiveBox.log diff --git a/etc/fly.toml b/etc/fly.toml new file mode 100644 index 0000000000..1dec7cb51f --- /dev/null +++ b/etc/fly.toml @@ -0,0 +1,40 @@ +# fly.toml file generated for archivebox on 2021-04-23T16:35:11-04:00 + +app = "archivebox" + +kill_signal = "SIGINT" +kill_timeout = 5 + +[env] + +[mounts] +source="archivebox_data" +destination="/data" + +[experimental] + auto_rollback = true + +[[services]] + http_checks = [] + internal_port = 8000 + protocol = "tcp" + script_checks = [] + + [services.concurrency] + hard_limit = 25 + soft_limit = 20 + type = "connections" + + [[services.ports]] + handlers = ["http"] + port = 80 + + [[services.ports]] + handlers = ["tls", "http"] + port = 443 + + [[services.tcp_checks]] + grace_period = "1s" + interval = "15s" + restart_limit = 6 + timeout = "2s" diff --git a/etc/nginx/nginx.conf b/etc/nginx.conf similarity index 90% rename 
from etc/nginx/nginx.conf rename to etc/nginx.conf index 2fc55a2942..049366fe86 100644 --- a/etc/nginx/nginx.conf +++ b/etc/nginx.conf @@ -34,12 +34,13 @@ http { server { listen 80 default_server; server_name _; - - root /var/www; + index index.html; autoindex on; - try_files $uri $uri/ $uri.html =404; + + location /archive { + root /var/www/archive; + } } } - diff --git a/etc/package.json b/etc/package.json new file mode 100644 index 0000000000..473e4aa7fb --- /dev/null +++ b/etc/package.json @@ -0,0 +1,13 @@ +{ + "name": "archivebox", + "version": "0.0.1", + "repository": "github:ArchiveBox/ArchiveBox", + "license": "MIT", + "dependencies": { + "@postlight/parser": "^2.2.3", + "readability-extractor": "github:ArchiveBox/readability-extractor", + "single-file-cli": "^1.1.54", + "puppeteer": "^23.5.0", + "@puppeteer/browsers": "^2.4.0" + } +} diff --git a/etc/sonic.cfg b/etc/sonic.cfg index 10d94eaccd..0018c87c21 100644 --- a/etc/sonic.cfg +++ b/etc/sonic.cfg @@ -6,6 +6,7 @@ [server] +# log_level = "debug" log_level = "warn" diff --git a/uwsgi.ini b/etc/uwsgi.ini similarity index 91% rename from uwsgi.ini rename to etc/uwsgi.ini index 9fa83abe79..258fdb04c3 100644 --- a/uwsgi.ini +++ b/etc/uwsgi.ini @@ -2,7 +2,7 @@ socket = 127.0.0.1:3031 chdir = ../ http = 0.0.0.0:8001 -env = OUTPUT_DIR=./data +env = DATA_DIR=./data wsgi-file = archivebox/core/wsgi.py processes = 4 threads = 1 diff --git a/old/Architecture.md b/old/Architecture.md new file mode 100644 index 0000000000..098ca4415a --- /dev/null +++ b/old/Architecture.md @@ -0,0 +1,172 @@ +# ArchiveBox UI + +## Page: Getting Started + +### What do you want to capture? + +- Save some URLs now -> [Add page] + - Paste some URLs to archive now + - Upload a file containing URLs (bookmarks.html export, RSS.xml feed, markdown file, word doc, PDF, etc.) + - Pull in URLs to archive from a remote location (e.g. RSS feed URL, remote TXT file, JSON file, etc.) 
+ +- Import URLs from a browser -> [Import page] + - Desktop: Get the ArchiveBox Chrome/Firefox extension + - Mobile: Get the ArchiveBox iOS App / Android App + - Upload a bookmarks.html export file + - Upload a browser_history.sqlite3 export file + +- Import URLs from a 3rd party bookmarking service -> [Sync page] + - Pocket + - Pinboard + - Instapaper + - Wallabag + - Zapier, N8N, IFTTT, etc. + - Upload a bookmarks.html export, bookmarks.json, RSS, etc. file + +- Archive URLs on a schedule -> [Schedule page] + +- Archive an entire website -> [Crawl page] + - What starting URL/domain? + - How deep? + - Follow links to external domains? + - Follow links to parent URLs? + - Maximum number of pages to save? + - Maximum number of requests/minute? + +- Crawl for URLs with a search engine and save automatically + - +- Some URLs on a schedule +- Save an entire website (e.g. `https://example.com`) +- Save results matching a search query (e.g. "site:example.com") +- Save a social media feed (e.g. `https://x.com/user/1234567890`) + +-------------------------------------------------------------------------------- + +### Crawls App + +- Archive an entire website -> [Crawl page] + - What are the starting URLs? + - How many hops to follow? + - Follow links to external domains? + - Follow links to parent URLs? + - Maximum number of pages to save? + - Maximum number of requests/minute? + + +-------------------------------------------------------------------------------- + +### Scheduler App + + +- Archive URLs on a schedule -> [Schedule page] + - What URL(s)? + - How often? + - Do you want to discard old snapshots after x amount of time? + - Any filter rules? 
+ - Want to be notified when changes are detected -> redirect[Alerts app/create new alert(crawl=self)] + + +* Choose Schedule to check for new URLs: Schedule.objects.get(pk=xyz) + - 1 minute + - 5 minutes + - 1 hour + - 1 day + + * Choose Destination Crawl to archive URLs using : Crawl.objects.get(pk=xyz) + - Tags + - Persona + - Created By ID + - Config + - Filters + - URL patterns to include + - URL patterns to exclude + - ONLY_NEW= Ignore URLs if already saved once / save URL each time it appears / only save if last save > x time ago + + +-------------------------------------------------------------------------------- + +### Sources App (For managing sources that ArchiveBox pulls URLs in from) + +- Add a new source to pull URLs in from (WIZARD) + - Choose URI: + - [x] Web UI + - [x] CLI + - Local filesystem path (directory to monitor for new files containing URLs) + - Remote URL (RSS/JSON/XML feed) + - Chrome browser profile sync (login using gmail to pull bookmarks/history) + - Pocket, Pinboard, Instapaper, Wallabag, etc. + - Zapier, N8N, IFTTT, etc. + - Local server filesystem path (directory to monitor for new files containing URLs) + - Google drive (directory to monitor for new files containing URLs) + - Remote server FTP/SFTP/SCP path (directory to monitor for new files containing URLs) + - AWS/S3/B2/GCP bucket (directory to monitor for new files containing URLs) + - XBrowserSync (login to pull bookmarks) + - Choose extractor + - auto + - RSS + - Pocket + - etc. + - Specify extra Config, e.g. + - credentials + - extractor tuning options (e.g. verify_ssl, cookies, etc.)
+ +- Provide credentials for the source + - API Key + - Username / Password + - OAuth + +-------------------------------------------------------------------------------- + +### Alerts App + +- Create a new alert, choose condition + - Get notified when a site goes down ( CrawlWorker + │ └─> Crawl.run() [state machine @started.enter] + │ └─> run_hook() for on_Crawl__* hooks + │ └─> subprocess.Popen (NOT using Process model) + │ + └─> SnapshotWorker + └─> Snapshot.run() [planned - doesn't exist yet] + └─> ArchiveResult.run() [state machine @started.enter] + └─> run_hook() for on_Snapshot__* hooks + └─> subprocess.Popen (NOT using Process model) +``` + +### Problem +1. **No Process tracking**: `run_hook()` uses `subprocess.Popen` directly, never creates Process records +2. **Orphaned Process model**: Process model has `.launch()`, `.wait()`, `.terminate()` methods that are NEVER used +3. **Manual process management**: SnapshotWorker manually uses psutil for waiting/killing +4. **Duplicate logic**: Process model and run_hook() both do subprocess management independently + +## Unified Architecture + +### Goal +Make Process model the **single source of truth** for all subprocess operations: +- Hook execution +- PID tracking +- stdout/stderr capture +- Process lifecycle (launch, wait, terminate) + +### Design + +```python +# hooks.py - Thin wrapper +def run_hook(...) -> Process: + """ + Run a hook using Process model (THIN WRAPPER). + + Returns Process model instance for tracking and control. 
+ """ + from archivebox.machine.models import Process + + # Build command + cmd = build_hook_cmd(script, kwargs) + + # Use Process.launch() - handles everything + process = Process.objects.create( + machine=Machine.current(), + process_type=Process.TypeChoices.HOOK, + pwd=str(output_dir), + cmd=cmd, + env=build_hook_env(config), + timeout=timeout, + ) + + # Launch subprocess + process.launch(background=is_background_hook(script.name)) + + return process # Return Process, not dict + + +# worker.py - Use Process methods +class SnapshotWorker: + def _run_hook(self, hook_path, ar) -> Process: + """Fork hook using Process model.""" + process = run_hook( + hook_path, + ar.create_output_dir(), + self.snapshot.config, + url=self.snapshot.url, + snapshot_id=str(self.snapshot.id), + ) + + # Link ArchiveResult to Process + ar.process = process + ar.save() + + return process + + def _wait_for_hook(self, process, ar): + """Wait using Process.wait() method.""" + exit_code = process.wait(timeout=None) + + # Update AR from hook output + ar.update_from_output() + ar.status = ar.StatusChoices.SUCCEEDED if exit_code == 0 else ar.StatusChoices.FAILED + ar.save() + + def on_shutdown(self): + """ + Terminate all background hooks in parallel with per-plugin timeouts. + + Phase 1: Send SIGTERM to all in parallel (polite request to wrap up) + Phase 2: Wait for all in parallel, respecting individual plugin timeouts + Phase 3: SIGKILL any that exceed their timeout + + Each plugin has its own timeout (SCREENSHOT_TIMEOUT=60, YTDLP_TIMEOUT=300, etc.) + Some hooks (consolelog, responses) exit immediately on SIGTERM. + Others (ytdlp, wget) need their full timeout to finish actual work. 
+ """ + # Send SIGTERM to all processes in parallel + for hook_name, process in self.background_processes.items(): + os.kill(process.pid, signal.SIGTERM) + + # Build per-process deadlines based on plugin-specific timeouts + deadlines = { + name: (proc, time.time() + max(0, proc.timeout - (time.time() - proc.started_at.timestamp()))) + for name, proc in self.background_processes.items() + } + + # Poll all processes in parallel - no head-of-line blocking + still_running = set(deadlines.keys()) + while still_running: + time.sleep(0.1) + for name in list(still_running): + proc, deadline = deadlines[name] + if not proc.is_running(): + still_running.remove(name) + elif time.time() >= deadline: + os.kill(proc.pid, signal.SIGKILL) # Timeout exceeded + still_running.remove(name) + + +# models.py - Process becomes active +class Process: + def launch(self, background=False): + """Spawn subprocess and track it.""" + with open(self.stdout_file, 'w') as out, open(self.stderr_file, 'w') as err: + proc = subprocess.Popen( + self.cmd, + cwd=self.pwd, + stdout=out, + stderr=err, + env=self._build_env(), + ) + + self.pid = proc.pid + self.started_at = timezone.now() + self.status = self.StatusChoices.RUNNING + self.save() + + if not background: + # Foreground - wait inline + proc.wait() + self.exit_code = proc.returncode + self.ended_at = timezone.now() + self.status = self.StatusChoices.EXITED + self.save() + + return self + + def wait(self, timeout=None): + """Wait for process to exit, polling DB.""" + while True: + self.refresh_from_db() + if self.status == self.StatusChoices.EXITED: + return self.exit_code + + # Check via psutil if Process died without updating DB + if not self.is_running(): + self._reap() # Update status from OS + return self.exit_code + + time.sleep(0.1) + + def terminate(self, sig=signal.SIGTERM): + """Gracefully terminate: SIGTERM → wait → SIGKILL.""" + if not self.is_running(): + return True + + os.kill(self.pid, sig) + + # Wait for graceful shutdown + for _ 
in range(50): # 5 seconds + if not self.is_running(): + self._reap() + return True + time.sleep(0.1) + + # Escalate to SIGKILL + os.kill(self.pid, signal.SIGKILL) + self._reap() + return True +``` + +## Migration Steps + +### Step 1: Update Process.launch() (DONE - already exists) +Process model already has `.launch()`, `.wait()`, `.terminate()` methods implemented in machine/models.py:1295-1593 + +### Step 2: Refactor run_hook() to use Process.launch() +**File**: `archivebox/hooks.py` + +Change signature from: +```python +def run_hook(...) -> HookResult: # Returns dict +``` + +To: +```python +def run_hook(...) -> Process: # Returns Process model +``` + +**Implementation**: +```python +def run_hook(script, output_dir, config, timeout=None, **kwargs) -> Process: + from archivebox.machine.models import Process, Machine + + # Build command + cmd = build_hook_cmd(script, kwargs) + env = build_hook_env(config) + is_bg = is_background_hook(script.name) + + # Create Process record + process = Process.objects.create( + machine=Machine.current(), + process_type=Process.TypeChoices.HOOK, + pwd=str(output_dir), + cmd=cmd, + env=env, + timeout=timeout or 120, + ) + + # Launch subprocess + process.launch(background=is_bg) + + return process +``` + +### Step 3: Update SnapshotWorker to use Process methods +**File**: `archivebox/workers/worker.py` + +Replace manual psutil code with Process model methods (shown above in Design section). + +### Step 4: Update ArchiveResult.run() to use new run_hook() +**File**: `archivebox/core/models.py:2559` + +Change from: +```python +result = run_hook(...) # Returns HookResult dict +if result is None: + is_bg_hook = True +``` + +To: +```python +process = run_hook(...) 
# Returns Process +self.process = process +self.save() + +if process.status == Process.StatusChoices.RUNNING: + # Background hook - still running + return +else: + # Foreground hook - completed + self.update_from_output() +``` + +### Step 5: Update Crawl.run() similarly +**File**: `archivebox/crawls/models.py:374` + +Same pattern as ArchiveResult.run() + +## Benefits + +### 1. Single Source of Truth +- Process model owns ALL subprocess operations +- No duplicate logic between run_hook(), Process, and workers +- Consistent PID tracking, stdout/stderr handling + +### 2. Proper Hierarchy +``` +Process.parent_id creates tree: +Orchestrator (PID 1000) + └─> CrawlWorker (PID 1001, parent=1000) + └─> on_Crawl__01_chrome.js (PID 1010, parent=1001) + └─> SnapshotWorker (PID 1020, parent=1000) + └─> on_Snapshot__50_wget.py (PID 1021, parent=1020) + └─> on_Snapshot__63_ytdlp.bg.py (PID 1022, parent=1020) +``` + +### 3. Better Observability +- Query all hook processes: `snapshot.process_set.all()` +- Count running: `Process.objects.filter(status='running').count()` +- Track resource usage via Process.get_memory_info() + +### 4. Cleaner Code +- SnapshotWorker._wait_for_hook: 25 lines → 8 lines +- SnapshotWorker.on_shutdown: 12 lines → 7 lines +- run_hook(): ~200 lines → ~50 lines +- Total: ~100 LoC saved + +## Risks & Mitigation + +### Risk 1: Breaking existing run_hook() callers +**Mitigation**: Three-phase rollout +1. Phase 1: Add run_hook_v2() that returns Process +2. Phase 2: Migrate callers to run_hook_v2() +3.
Phase 3: Rename run_hook → run_hook_legacy, run_hook_v2 → run_hook + +### Risk 2: Background hook tracking changes +**Mitigation**: +- Process.launch(background=True) handles async launches +- Process.wait() already polls for completion +- Behavior identical to current subprocess.Popen + +### Risk 3: Performance overhead (extra DB writes) +**Mitigation**: +- Process records already being created (just not used) +- Batch updates where possible +- Monitor via metrics + +## Timeline + +### Immediate (This PR) +- [x] State machine fixes (completed) +- [x] Step advancement optimization (completed) +- [x] Document unified architecture (this file) + +### Next PR (Process Integration) +1. Add run_hook_v2() returning Process +2. Update SnapshotWorker to use Process methods +3. Migrate ArchiveResult.run() and Crawl.run() +4. Deprecate old run_hook() + +### Future +- Remove run_hook_legacy after migration complete +- Add Process.get_tree() for hierarchy visualization +- Add ProcessMachine state machine for lifecycle management diff --git a/old/TODO_archivebox_jsonl_cli.md b/old/TODO_archivebox_jsonl_cli.md new file mode 100644 index 0000000000..c421e58e72 --- /dev/null +++ b/old/TODO_archivebox_jsonl_cli.md @@ -0,0 +1,716 @@ +# ArchiveBox CLI Pipeline Architecture + +## Overview + +This plan implements a JSONL-based CLI pipeline for ArchiveBox, enabling Unix-style piping between commands: + +```bash +archivebox crawl create URL | archivebox snapshot create | archivebox archiveresult create | archivebox run +``` + +## Design Principles + +1. **Maximize model method reuse**: Use `.to_json()`, `.from_json()`, `.to_jsonl()`, `.from_jsonl()` everywhere +2. **Pass-through behavior**: All commands output input records + newly created records (accumulating pipeline) +3. **Create-or-update**: Commands create records if they don't exist, update if ID matches existing +4. **Auto-cascade**: `archivebox run` automatically creates Snapshots from Crawls and ArchiveResults from Snapshots +5. 
**Generic filtering**: Implement filters as functions that take queryset → return queryset +6. **Minimal code**: Extract duplicated `apply_filters()` to shared module + +--- + +## Real-World Use Cases + +These examples demonstrate the JSONL piping architecture. Key points: +- `archivebox run` auto-cascades (Crawl → Snapshots → ArchiveResults) +- `archivebox run` **emits JSONL** of everything it creates, enabling chained processing +- Use CLI args (`--status=`, `--plugin=`) for efficient DB filtering; use jq for transforms + +### 1. Basic Archive +```bash +# Simple URL archive (run auto-creates snapshots and archive results) +archivebox crawl create https://example.com | archivebox run + +# Multiple URLs from a file +archivebox crawl create < urls.txt | archivebox run + +# With depth crawling (follow links) +archivebox crawl create --depth=2 https://docs.python.org | archivebox run +``` + +### 2. Retry Failed Extractions +```bash +# Retry all failed extractions +archivebox archiveresult list --status=failed | archivebox run + +# Retry only failed PDFs from a specific domain +archivebox archiveresult list --status=failed --plugin=pdf --url__icontains=nytimes.com \ + | archivebox run +``` + +### 3. Import Bookmarks from Pinboard (jq transform) +```bash +# Fetch Pinboard API, transform fields to match ArchiveBox schema, archive +curl -s "https://api.pinboard.in/v1/posts/all?format=json&auth_token=$TOKEN" \ + | jq -c '.[] | {url: .href, tags_str: .tags, title: .description}' \ + | archivebox crawl create \ + | archivebox run +``` + +### 4. 
Retry Failed with Different Binary (jq transform + re-run) +```bash +# Get failed wget results, transform to use wget2 binary instead, re-queue as new attempts +archivebox archiveresult list --status=failed --plugin=wget \ + | jq -c '{snapshot_id, plugin, status: "queued", overrides: {WGET_BINARY: "wget2"}}' \ + | archivebox archiveresult create \ + | archivebox run + +# Chain processing: archive, then re-run any failures with increased timeout +archivebox crawl create https://slow-site.com \ + | archivebox run \ + | jq -c 'select(.type == "ArchiveResult" and .status == "failed") + | del(.id) | .status = "queued" | .overrides.TIMEOUT = "120"' \ + | archivebox archiveresult create \ + | archivebox run +``` + +### 5. Selective Extraction +```bash +# Create only screenshot extractions for queued snapshots +archivebox snapshot list --status=queued \ + | archivebox archiveresult create --plugin=screenshot \ + | archivebox run + +# Re-run singlefile on everything that was skipped +archivebox archiveresult list --plugin=singlefile --status=skipped \ + | archivebox archiveresult update --status=queued \ + | archivebox run +``` + +### 6. Bulk Tag Management +```bash +# Tag all Twitter/X URLs (efficient DB filter, no jq needed) +archivebox snapshot list --url__icontains=twitter.com \ + | archivebox snapshot update --tag=twitter + +# Tag snapshots based on computed criteria (jq for logic DB can't do) +archivebox snapshot list --status=sealed \ + | jq -c 'select(.archiveresult_count > 5) | . + {tags_str: (.tags_str + ",well-archived")}' \ + | archivebox snapshot update +``` + +### 7. RSS Feed Monitoring +```bash +# Archive all items from an RSS feed +curl -s "https://hnrss.org/frontpage" \ + | xq -r '.rss.channel.item[].link' \ + | archivebox crawl create --tag=hackernews-$(date +%Y%m%d) \ + | archivebox run +``` + +### 8. 
Recursive Link Following (run output → filter → re-run) +```bash +# Archive a page, then archive all PDFs it links to +archivebox crawl create https://research-papers.org/index.html \ + | archivebox run \ + | jq -c 'select(.type == "Snapshot") | .discovered_urls[]? + | select(endswith(".pdf")) | {url: .}' \ + | archivebox crawl create --tag=linked-pdfs \ + | archivebox run + +# Depth crawl with custom handling: retry timeouts with longer timeout +archivebox crawl create --depth=1 https://example.com \ + | archivebox run \ + | jq -c 'select(.type == "ArchiveResult" and .status == "failed" and ((.error // "") | contains("timeout"))) + | del(.id) | .overrides.TIMEOUT = "300"' \ + | archivebox archiveresult create \ + | archivebox run +``` + +### Composability Summary + +| Pattern | Example | +|---------|---------| +| **Filter → Process** | `list --status=failed --plugin=pdf \| run` | +| **Transform → Archive** | `curl API \| jq '{url, tags_str}' \| crawl create \| run` | +| **Retry w/ Changes** | `run \| jq 'select(.status=="failed") \| del(.id)' \| create \| run` | +| **Selective Extract** | `snapshot list \| archiveresult create --plugin=screenshot` | +| **Bulk Update** | `list --url__icontains=X \| update --tag=Y` | +| **Chain Processing** | `crawl \| run \| jq transform \| create \| run` | + +The key insight: **`archivebox run` emits JSONL of everything it creates**, enabling: +- Retry failed items with different settings (timeouts, binaries, etc.)
+- Recursive crawling (archive page → extract links → archive those) +- Chained transforms (filter failures, modify config, re-queue) + +--- + +## Code Reuse Findings + +### Existing Model Methods (USE THESE) +- `Crawl.to_json()`, `Crawl.from_json()`, `Crawl.to_jsonl()`, `Crawl.from_jsonl()` +- `Snapshot.to_json()`, `Snapshot.from_json()`, `Snapshot.to_jsonl()`, `Snapshot.from_jsonl()` +- `Tag.to_json()`, `Tag.from_json()`, `Tag.to_jsonl()`, `Tag.from_jsonl()` + +### Missing Model Methods (MUST IMPLEMENT) +- **`ArchiveResult.from_json()`** - Does not exist, must be added +- **`ArchiveResult.from_jsonl()`** - Does not exist, must be added + +### Existing Utilities (USE THESE) +- `archivebox/misc/jsonl.py`: `read_stdin()`, `read_args_or_stdin()`, `write_record()`, `parse_line()` +- Type constants: `TYPE_CRAWL`, `TYPE_SNAPSHOT`, `TYPE_ARCHIVERESULT`, etc. + +### Duplicated Code (EXTRACT) +- `apply_filters()` duplicated in 7 CLI files → extract to `archivebox/cli/cli_utils.py` + +### Supervisord Config (UPDATE) +- `archivebox/workers/supervisord_util.py` line ~35: `"command": "archivebox manage orchestrator"` → `"command": "archivebox run"` + +### Field Name Standardization (FIX) +- **Issue**: `Crawl.to_json()` outputs `tags_str`, but `Snapshot.to_json()` outputs `tags` +- **Fix**: Standardize all models to use `tags_str` in JSONL output (matches model property names) + +--- + +## Implementation Order + +### Phase 1: Model Prerequisites +1. **Implement `ArchiveResult.from_json()`** in `archivebox/core/models.py` + - Pattern: Match `Snapshot.from_json()` and `Crawl.from_json()` style + - Handle: ID lookup (update existing) or create new + - Required fields: `snapshot_id`, `plugin` + - Optional fields: `status`, `hook_name`, etc. + +2. **Implement `ArchiveResult.from_jsonl()`** in `archivebox/core/models.py` + - Filter records by `type='ArchiveResult'` + - Call `from_json()` for each matching record + +3. 
**Fix `Snapshot.to_json()` field name** + - Change `'tags': self.tags_str()` → `'tags_str': self.tags_str()` + - Update any code that depends on `tags` key in Snapshot JSONL + +### Phase 2: Shared Utilities +4. **Extract `apply_filters()` to `archivebox/cli/cli_utils.py`** + - Generic queryset filtering from CLI kwargs + - Support `--id__in=[csv]`, `--url__icontains=str`, etc. + - Remove duplicates from 7 CLI files + +### Phase 3: Pass-Through Behavior (NEW FEATURE) +5. **Add pass-through to `archivebox crawl create`** + - Output non-Crawl input records unchanged + - Output created Crawl records + +6. **Add pass-through to `archivebox snapshot create`** + - Output non-Snapshot/non-Crawl input records unchanged + - Process Crawl records → create Snapshots + - Output both original Crawl and created Snapshots + +7. **Add pass-through to `archivebox archiveresult create`** + - Output non-Snapshot/non-ArchiveResult input records unchanged + - Process Snapshot records → create ArchiveResults + - Output both original Snapshots and created ArchiveResults + +8. **Add create-or-update to `archivebox run`** + - Records WITH id: lookup and queue existing + - Records WITHOUT id: create via `Model.from_json()`, then queue + - Pass-through output of all processed records + +### Phase 4: Test Infrastructure +9. **Create `archivebox/tests/conftest.py`** with pytest-django + - Use `pytest-django` for proper test database handling + - Isolated DATA_DIR per test via `tmp_path` fixture + - `run_archivebox_cmd()` helper for subprocess testing + +### Phase 5: Unit Tests +10. **Create `archivebox/tests/test_cli_crawl.py`** - crawl create/list/pass-through tests +11. **Create `archivebox/tests/test_cli_snapshot.py`** - snapshot create/list/pass-through tests +12. **Create `archivebox/tests/test_cli_archiveresult.py`** - archiveresult create/list/pass-through tests +13. 
**Create `archivebox/tests/test_cli_run.py`** - run command create-or-update tests + +### Phase 6: Integration & Config +14. **Extend `archivebox/cli/tests_piping.py`** - Add pass-through integration tests +15. **Update supervisord config** - `orchestrator` → `run` + +--- + +## Future Work (Deferred) + +### Commands to Defer +- `archivebox tag create|list|update|delete` - Already works, defer improvements +- `archivebox binary create|list|update|delete` - Lower priority +- `archivebox process list` - Lower priority +- `archivebox apikey create|list|update|delete` - Lower priority + +### `archivebox add` Relationship +- **Current**: `archivebox add` is the primary user-facing command, stays as-is +- **Future**: Refactor `add` to internally use `crawl create | snapshot create | run` pipeline +- **Note**: This refactor is deferred; `add` continues to work independently for now + +--- + +## Key Files + +| File | Action | Phase | +|------|--------|-------| +| `archivebox/core/models.py` | Add `ArchiveResult.from_json()`, `from_jsonl()` | 1 | +| `archivebox/core/models.py` | Fix `Snapshot.to_json()` → `tags_str` | 1 | +| `archivebox/cli/cli_utils.py` | NEW - shared `apply_filters()` | 2 | +| `archivebox/cli/archivebox_crawl.py` | Add pass-through to create | 3 | +| `archivebox/cli/archivebox_snapshot.py` | Add pass-through to create | 3 | +| `archivebox/cli/archivebox_archiveresult.py` | Add pass-through to create | 3 | +| `archivebox/cli/archivebox_run.py` | Add create-or-update, pass-through | 3 | +| `archivebox/tests/conftest.py` | NEW - pytest fixtures | 4 | +| `archivebox/tests/test_cli_crawl.py` | NEW - crawl unit tests | 5 | +| `archivebox/tests/test_cli_snapshot.py` | NEW - snapshot unit tests | 5 | +| `archivebox/tests/test_cli_archiveresult.py` | NEW - archiveresult unit tests | 5 | +| `archivebox/tests/test_cli_run.py` | NEW - run unit tests | 5 | +| `archivebox/cli/tests_piping.py` | Extend with pass-through tests | 6 | +| 
`archivebox/workers/supervisord_util.py` | Update orchestrator→run | 6 | + +--- + +## Implementation Details + +### ArchiveResult.from_json() Design + +```python +@staticmethod +def from_json(record: Dict[str, Any], overrides: Dict[str, Any] = None) -> 'ArchiveResult | None': + """ + Create or update a single ArchiveResult from a JSON record dict. + + Args: + record: Dict with 'snapshot_id' and 'plugin' (required for create), + or 'id' (for update) + overrides: Dict of field overrides + + Returns: + ArchiveResult instance or None if invalid + """ + from django.utils import timezone + + overrides = overrides or {} + + # If 'id' is provided, lookup and update existing + result_id = record.get('id') + if result_id: + try: + result = ArchiveResult.objects.get(id=result_id) + # Update fields from record + if record.get('status'): + result.status = record['status'] + result.retry_at = timezone.now() + result.save() + return result + except ArchiveResult.DoesNotExist: + pass # Fall through to create + + # Required fields for creation + snapshot_id = record.get('snapshot_id') + plugin = record.get('plugin') + + if not snapshot_id or not plugin: + return None + + try: + snapshot = Snapshot.objects.get(id=snapshot_id) + except Snapshot.DoesNotExist: + return None + + # Create or get existing result + result, created = ArchiveResult.objects.get_or_create( + snapshot=snapshot, + plugin=plugin, + defaults={ + 'status': record.get('status', ArchiveResult.StatusChoices.QUEUED), + 'retry_at': timezone.now(), + 'hook_name': record.get('hook_name', ''), + **overrides, + } + ) + + # If not created, optionally reset for retry + if not created and record.get('status'): + result.status = record['status'] + result.retry_at = timezone.now() + result.save() + + return result +``` + +### Pass-Through Pattern + +All `create` commands follow this pattern: + +```python +def create_X(args, ...): + is_tty = sys.stdout.isatty() + records = list(read_args_or_stdin(args)) + + for record in records: 
+ record_type = record.get('type') + + # Pass-through: output records we don't handle + if record_type not in HANDLED_TYPES: + if not is_tty: + write_record(record) + continue + + # Handle our type: create via Model.from_json() + obj = Model.from_json(record, overrides={...}) + + # Output created record (hydrated with db id) + if obj and not is_tty: + write_record(obj.to_json()) +``` + +### Pass-Through Semantics Example + +``` +Input: + {"type": "Crawl", "id": "abc", "urls": "https://example.com", ...} + {"type": "Tag", "name": "important"} + +archivebox snapshot create output: + {"type": "Crawl", "id": "abc", ...} # pass-through (not our type) + {"type": "Tag", "name": "important"} # pass-through (not our type) + {"type": "Snapshot", "id": "xyz", ...} # created from Crawl URLs +``` + +### Create-or-Update Pattern for `archivebox run` + +```python +def process_stdin_records() -> int: + records = list(read_stdin()) + is_tty = sys.stdout.isatty() + + for record in records: + record_type = record.get('type') + record_id = record.get('id') + + # Create-or-update based on whether ID exists + if record_type == TYPE_CRAWL: + if record_id: + try: + obj = Crawl.objects.get(id=record_id) + except Crawl.DoesNotExist: + obj = Crawl.from_json(record) + else: + obj = Crawl.from_json(record) + + if obj: + obj.retry_at = timezone.now() + obj.save() + if not is_tty: + write_record(obj.to_json()) + + # Similar for Snapshot, ArchiveResult... +``` + +### Shared apply_filters() Design + +Extract to `archivebox/cli/cli_utils.py`: + +```python +"""Shared CLI utilities for ArchiveBox commands.""" + +from typing import Optional + +def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None): + """ + Apply Django-style filters from CLI kwargs to a QuerySet. 
+ + Supports: --status=queued, --url__icontains=example, --id__in=uuid1,uuid2 + + Args: + queryset: Django QuerySet to filter + filter_kwargs: Dict of filter key-value pairs from CLI + limit: Optional limit on results + + Returns: + Filtered QuerySet + """ + filters = {} + for key, value in filter_kwargs.items(): + if value is None or key in ('limit', 'offset'): + continue + # Handle CSV lists for __in filters + if key.endswith('__in') and isinstance(value, str): + value = [v.strip() for v in value.split(',')] + filters[key] = value + + if filters: + queryset = queryset.filter(**filters) + if limit: + queryset = queryset[:limit] + + return queryset +``` + +--- + +## conftest.py Design (pytest-django) + +```python +"""archivebox/tests/conftest.py - Pytest fixtures for CLI tests.""" + +import os +import sys +import json +import subprocess +from pathlib import Path +from typing import List, Dict, Any, Optional, Tuple + +import pytest + + +# ============================================================================= +# Fixtures +# ============================================================================= + +@pytest.fixture +def isolated_data_dir(tmp_path, settings): + """ + Create isolated DATA_DIR for each test. + + Uses tmp_path for isolation, configures Django settings. + """ + data_dir = tmp_path / 'archivebox_data' + data_dir.mkdir() + + # Set environment for subprocess calls + os.environ['DATA_DIR'] = str(data_dir) + + # Update Django settings + settings.DATA_DIR = data_dir + + yield data_dir + + # Cleanup handled by tmp_path fixture + + +@pytest.fixture +def initialized_archive(isolated_data_dir): + """ + Initialize ArchiveBox archive in isolated directory. + + Runs `archivebox init` to set up database and directories. + """ + from archivebox.cli.archivebox_init import init + init(setup=True, quick=True) + return isolated_data_dir + + +@pytest.fixture +def cli_env(initialized_archive): + """ + Environment dict for CLI subprocess calls. 
+ + Includes DATA_DIR and disables slow extractors. + """ + return { + **os.environ, + 'DATA_DIR': str(initialized_archive), + 'USE_COLOR': 'False', + 'SHOW_PROGRESS': 'False', + 'SAVE_TITLE': 'True', + 'SAVE_FAVICON': 'False', + 'SAVE_WGET': 'False', + 'SAVE_WARC': 'False', + 'SAVE_PDF': 'False', + 'SAVE_SCREENSHOT': 'False', + 'SAVE_DOM': 'False', + 'SAVE_SINGLEFILE': 'False', + 'SAVE_READABILITY': 'False', + 'SAVE_MERCURY': 'False', + 'SAVE_GIT': 'False', + 'SAVE_YTDLP': 'False', + 'SAVE_HEADERS': 'False', + } + + +# ============================================================================= +# CLI Helpers +# ============================================================================= + +def run_archivebox_cmd( + args: List[str], + stdin: Optional[str] = None, + cwd: Optional[Path] = None, + env: Optional[Dict[str, str]] = None, + timeout: int = 60, +) -> Tuple[str, str, int]: + """ + Run archivebox command, return (stdout, stderr, returncode). + + Args: + args: Command arguments (e.g., ['crawl', 'create', 'https://example.com']) + stdin: Optional string to pipe to stdin + cwd: Working directory (defaults to DATA_DIR from env) + env: Environment variables (defaults to os.environ with DATA_DIR) + timeout: Command timeout in seconds + + Returns: + Tuple of (stdout, stderr, returncode) + """ + cmd = [sys.executable, '-m', 'archivebox'] + args + + env = env or {**os.environ} + cwd = cwd or Path(env.get('DATA_DIR', '.')) + + result = subprocess.run( + cmd, + input=stdin, + capture_output=True, + text=True, + cwd=cwd, + env=env, + timeout=timeout, + ) + + return result.stdout, result.stderr, result.returncode + + +# ============================================================================= +# Output Assertions +# ============================================================================= + +def parse_jsonl_output(stdout: str) -> List[Dict[str, Any]]: + """Parse JSONL output into list of dicts.""" + records = [] + for line in stdout.strip().split('\n'): + line 
= line.strip() + if line and line.startswith('{'): + try: + records.append(json.loads(line)) + except json.JSONDecodeError: + pass + return records + + +def assert_jsonl_contains_type(stdout: str, record_type: str, min_count: int = 1): + """Assert output contains at least min_count records of type.""" + records = parse_jsonl_output(stdout) + matching = [r for r in records if r.get('type') == record_type] + assert len(matching) >= min_count, \ + f"Expected >= {min_count} {record_type}, got {len(matching)}" + return matching + + +def assert_jsonl_pass_through(stdout: str, input_records: List[Dict[str, Any]]): + """Assert that input records appear in output (pass-through behavior).""" + output_records = parse_jsonl_output(stdout) + output_ids = {r.get('id') for r in output_records if r.get('id')} + + for input_rec in input_records: + input_id = input_rec.get('id') + if input_id: + assert input_id in output_ids, \ + f"Input record {input_id} not found in output (pass-through failed)" + + +def assert_record_has_fields(record: Dict[str, Any], required_fields: List[str]): + """Assert record has all required fields with non-None values.""" + for field in required_fields: + assert field in record, f"Record missing field: {field}" + assert record[field] is not None, f"Record field is None: {field}" + + +# ============================================================================= +# Database Assertions +# ============================================================================= + +def assert_db_count(model_class, filters: Dict[str, Any], expected: int): + """Assert database count matches expected.""" + actual = model_class.objects.filter(**filters).count() + assert actual == expected, \ + f"Expected {expected} {model_class.__name__}, got {actual}" + + +def assert_db_exists(model_class, **filters): + """Assert at least one record exists matching filters.""" + assert model_class.objects.filter(**filters).exists(), \ + f"No {model_class.__name__} found matching {filters}" 
+ + +# ============================================================================= +# Test Data Factories +# ============================================================================= + +def create_test_url(domain: str = 'example.com', path: str = None) -> str: + """Generate unique test URL.""" + import uuid + path = path or uuid.uuid4().hex[:8] + return f'https://{domain}/{path}' + + +def create_test_crawl_json(urls: List[str] = None, **kwargs) -> Dict[str, Any]: + """Create Crawl JSONL record for testing.""" + from archivebox.misc.jsonl import TYPE_CRAWL + + urls = urls or [create_test_url()] + return { + 'type': TYPE_CRAWL, + 'urls': '\n'.join(urls), + 'max_depth': kwargs.get('max_depth', 0), + 'tags_str': kwargs.get('tags_str', ''), + 'status': kwargs.get('status', 'queued'), + **{k: v for k, v in kwargs.items() if k not in ('max_depth', 'tags_str', 'status')}, + } + + +def create_test_snapshot_json(url: str = None, **kwargs) -> Dict[str, Any]: + """Create Snapshot JSONL record for testing.""" + from archivebox.misc.jsonl import TYPE_SNAPSHOT + + return { + 'type': TYPE_SNAPSHOT, + 'url': url or create_test_url(), + 'tags_str': kwargs.get('tags_str', ''), + 'status': kwargs.get('status', 'queued'), + **{k: v for k, v in kwargs.items() if k not in ('tags_str', 'status')}, + } +``` + +--- + +## Test Rules + +- **NO SKIPPING** - Every test runs +- **NO MOCKING** - Real subprocess calls, real database +- **NO DISABLING** - Failing tests identify real problems +- **MINIMAL CODE** - Import helpers from conftest.py +- **ISOLATED** - Each test gets its own DATA_DIR via `tmp_path` + +--- + +## Task Checklist + +### Phase 1: Model Prerequisites +- [x] Implement `ArchiveResult.from_json()` in `archivebox/core/models.py` +- [x] Implement `ArchiveResult.from_jsonl()` in `archivebox/core/models.py` +- [x] Fix `Snapshot.to_json()` to use `tags_str` instead of `tags` + +### Phase 2: Shared Utilities +- [x] Create `archivebox/cli/cli_utils.py` with shared `apply_filters()` 
+- [x] Update 7 CLI files to import from `cli_utils.py` + + ### Phase 3: Pass-Through Behavior +- [x] Add pass-through to `archivebox_crawl.py` create +- [x] Add pass-through to `archivebox_snapshot.py` create +- [x] Add pass-through to `archivebox_archiveresult.py` create +- [x] Add create-or-update to `archivebox_run.py` +- [x] Add pass-through output to `archivebox_run.py` + + ### Phase 4: Test Infrastructure +- [x] Create `archivebox/tests/conftest.py` with pytest-django fixtures + + ### Phase 5: Unit Tests +- [x] Create `archivebox/tests/test_cli_crawl.py` +- [x] Create `archivebox/tests/test_cli_snapshot.py` +- [x] Create `archivebox/tests/test_cli_archiveresult.py` +- [x] Create `archivebox/tests/test_cli_run.py` + + ### Phase 6: Integration & Config +- [x] Extend `archivebox/cli/tests_piping.py` with pass-through tests +- [x] Update `archivebox/workers/supervisord_util.py`: orchestrator→run diff --git a/old/TODO_chrome_plugin_cleanup.md b/old/TODO_chrome_plugin_cleanup.md new file mode 100644 index 0000000000..90b7716f5f --- /dev/null +++ b/old/TODO_chrome_plugin_cleanup.md @@ -0,0 +1,431 @@ +# Chrome Plugin Consolidation - COMPLETED ✓ + +## Core Principle: One ArchiveResult Per Plugin + +**Critical Realization:** Each plugin must produce exactly ONE ArchiveResult output. This is fundamental to ArchiveBox's architecture - you cannot have multiple outputs from a single plugin. + +### CRITICAL ARCHITECTURE CLARIFICATION + +**DO NOT CONFUSE THESE CONCEPTS:** + +1. **Plugin** = Directory name (e.g., `chrome`, `consolelog`, `screenshot`) + - Lives in `archivebox/plugins/{plugin_name}/` + - Can contain MULTIPLE hook files + - Produces ONE output directory: `users/{username}/snapshots/YYYYMMDD/{domain}/{snap_id}/{plugin_name}/` + - Creates ONE ArchiveResult record per snapshot + +2. 
**Hook** = Individual script file (e.g., `on_Snapshot__20_chrome_tab.bg.js`) + - Lives inside a plugin directory + - One plugin can have MANY hooks + - All hooks in a plugin run sequentially when that plugin's ArchiveResult is processed + - All hooks write to the SAME output directory (the plugin directory) + +3. **Extractor** = ArchiveResult.extractor field = PLUGIN NAME (not hook name) + - `ArchiveResult.extractor = 'chrome'` (plugin name) + - NOT `ArchiveResult.extractor = '20_chrome_tab.bg'` (hook name) + +4. **Output Directory** = `users/{username}/snapshots/YYYYMMDD/{domain}/{snap_id}/{plugin_name}/` + - One output directory per plugin (0.9.x structure) + - ALL hooks in that plugin write to this same directory + - Example: `users/default/snapshots/20251227/example.com/019b-6397-6a5b/chrome/` contains outputs from ALL chrome hooks + - Legacy: `archive/{timestamp}/` with symlink for backwards compatibility + +**Example 1: Chrome Plugin (Infrastructure - NO ArchiveResult)** +``` +Plugin name: 'chrome' +ArchiveResult: NONE (infrastructure only) +Output directory: users/default/snapshots/20251227/example.com/019b-6397-6a5b/chrome/ + +Hooks: + - on_Snapshot__20_chrome_tab.bg.js # Launches Chrome, opens tab + - on_Snapshot__30_chrome_navigate.js # Navigates to URL + - on_Snapshot__45_chrome_tab_cleanup.py # Kills Chrome on cleanup + +Writes (temporary infrastructure files, deleted on cleanup): + - chrome/cdp_url.txt # Other plugins read this to connect + - chrome/target_id.txt # Tab ID for CDP connection + - chrome/page_loaded.txt # Navigation completion marker + - chrome/navigation.json # Navigation state + - chrome/hook.pid # For cleanup + +NO ArchiveResult JSON is produced - this is pure infrastructure. +On SIGTERM: Chrome exits, chrome/ directory is deleted. 
+``` + +**Example 2: Screenshot Plugin (Output Plugin - CREATES ArchiveResult)** +``` +Plugin name: 'screenshot' +ArchiveResult.extractor: 'screenshot' +Output directory: users/default/snapshots/20251227/example.com/019b-6397-6a5b/screenshot/ + +Hooks: + - on_Snapshot__34_screenshot.js + +Process: + 1. Reads ../chrome/cdp_url.txt to get Chrome connection + 2. Connects to Chrome CDP + 3. Takes screenshot + 4. Writes to: screenshot/screenshot.png + 5. Emits ArchiveResult JSON to stdout + +Creates ArchiveResult with status=succeeded, output_files={'screenshot.png': {}} +``` + +**Example 3: PDF Plugin (Output Plugin - CREATES ArchiveResult)** +``` +Plugin name: 'pdf' +ArchiveResult.extractor: 'pdf' +Output directory: users/default/snapshots/20251227/example.com/019b-6397-6a5b/pdf/ + +Hooks: + - on_Snapshot__35_pdf.js + +Process: + 1. Reads ../chrome/cdp_url.txt to get Chrome connection + 2. Connects to Chrome CDP + 3. Generates PDF + 4. Writes to: pdf/output.pdf + 5. Emits ArchiveResult JSON to stdout + +Creates ArchiveResult with status=succeeded, output_files={'output.pdf': {}} +``` + +**Lifecycle:** +``` +1. Chrome hooks run → create chrome/ dir with infrastructure files +2. Screenshot/PDF/etc hooks run → read chrome/cdp_url.txt, write to their own dirs +3. Snapshot.cleanup() called → sends SIGTERM to background hooks +4. Chrome receives SIGTERM → exits, deletes chrome/ dir +5. 
Screenshot/PDF/etc dirs remain with their outputs +``` + +**DO NOT:** +- Create one ArchiveResult per hook +- Use hook names as extractor values +- Create separate output directories per hook + +**DO:** +- Create one ArchiveResult per plugin +- Use plugin directory name as extractor value +- Run all hooks in a plugin when processing its ArchiveResult +- Write all hook outputs to the same plugin directory + +This principle drove the entire consolidation strategy: +- **Chrome plugin** = Infrastructure only (NO ArchiveResult) +- **Output plugins** = Each produces ONE distinct ArchiveResult (kept separate) + +## Final Structure + +### 1. Chrome Plugin (Infrastructure - No Output) + +**Location:** `archivebox/plugins/chrome/` + +This plugin provides shared Chrome infrastructure for other plugins. It manages the browser lifecycle but **produces NO ArchiveResult** - only infrastructure files in a single `chrome/` output directory. + +**Consolidates these former plugins:** +- `chrome_session/` → Merged +- `chrome_navigate/` → Merged +- `chrome_cleanup/` → Merged +- `chrome_extensions/` → Utilities merged + +**Hook Files:** +``` +chrome/ +├── on_Crawl__00_chrome_install_config.py # Configure Chrome settings +├── on_Crawl__00_chrome_install.py # Install Chrome binary +├── on_Crawl__30_chrome_launch.bg.js # Launch Chrome (Crawl-level, bg) +├── on_Snapshot__20_chrome_tab.bg.js # Open tab (Snapshot-level, bg) +├── on_Snapshot__30_chrome_navigate.js # Navigate to URL (foreground) +├── on_Snapshot__45_chrome_tab_cleanup.py # Close tab, kill bg hooks +├── chrome_extension_utils.js # Extension utilities +├── config.json # Configuration +└── tests/test_chrome.py # Tests +``` + +**Output Directory (Infrastructure Only):** +``` +chrome/ +├── cdp_url.txt # WebSocket URL for CDP connection +├── pid.txt # Chrome process PID +├── target_id.txt # Current tab target ID +├── page_loaded.txt # Navigation completion marker +├── final_url.txt # Final URL after redirects +├── navigation.json # 
Navigation state (NEW) +└── hook.pid # Background hook PIDs (for cleanup) +``` + +**New: navigation.json** + +Tracks navigation state with wait condition and timing: +```json +{ + "waitUntil": "networkidle2", + "elapsed": 1523, + "url": "https://example.com", + "finalUrl": "https://example.com/", + "status": 200, + "timestamp": "2025-12-27T22:15:30.123Z" +} +``` + +Fields: +- `waitUntil` - Wait condition: `networkidle0`, `networkidle2`, `domcontentloaded`, or `load` +- `elapsed` - Navigation time in milliseconds +- `url` - Original requested URL +- `finalUrl` - Final URL after redirects (success only) +- `status` - HTTP status code (success only) +- `error` - Error message (failure only) +- `timestamp` - ISO 8601 completion timestamp + +### 2. Output Plugins (Each = One ArchiveResult) + +These remain **SEPARATE** plugins because each produces a distinct output/ArchiveResult. Each plugin references `../chrome` for infrastructure. + +#### consolelog Plugin +``` +archivebox/plugins/consolelog/ +└── on_Snapshot__21_consolelog.bg.js +``` +- **Output:** `console.jsonl` (browser console messages) +- **Type:** Background hook (CDP listener) +- **References:** `../chrome` for CDP URL + +#### ssl Plugin +``` +archivebox/plugins/ssl/ +└── on_Snapshot__23_ssl.bg.js +``` +- **Output:** `ssl.jsonl` (SSL/TLS certificate details) +- **Type:** Background hook (CDP listener) +- **References:** `../chrome` for CDP URL + +#### responses Plugin +``` +archivebox/plugins/responses/ +└── on_Snapshot__24_responses.bg.js +``` +- **Output:** `responses/` directory with `index.jsonl` (network responses) +- **Type:** Background hook (CDP listener) +- **References:** `../chrome` for CDP URL + +#### redirects Plugin +``` +archivebox/plugins/redirects/ +└── on_Snapshot__31_redirects.bg.js +``` +- **Output:** `redirects.jsonl` (redirect chain) +- **Type:** Background hook (CDP listener) +- **References:** `../chrome` for CDP URL +- **Changed:** Converted to background hook, now uses CDP 
`Network.requestWillBeSent` to capture redirects from initial request + +#### staticfile Plugin +``` +archivebox/plugins/staticfile/ +└── on_Snapshot__31_staticfile.bg.js +``` +- **Output:** Downloaded static file (PDF, image, video, etc.) +- **Type:** Background hook (CDP listener) +- **References:** `../chrome` for CDP URL +- **Changed:** Converted from Python to JavaScript, now uses CDP to detect Content-Type from initial response and download via CDP + +## What Changed + +### 1. Plugin Consolidation +- Merged `chrome_session`, `chrome_navigate`, `chrome_cleanup`, `chrome_extensions` → `chrome/` +- Chrome plugin now has **single output directory**: `chrome/` +- All Chrome infrastructure hooks reference `.` (same directory) + +### 2. Background Hook Conversions + +**redirects Plugin:** +- **Before:** Ran AFTER navigation, reconnected to Chrome to check for redirects +- **After:** Background hook that sets up CDP listeners BEFORE navigation to capture redirects from initial request +- **Method:** Uses CDP `Network.requestWillBeSent` event with `redirectResponse` parameter + +**staticfile Plugin:** +- **Before:** Python script that ran AFTER navigation, checked response headers +- **After:** Background JavaScript hook that sets up CDP listeners BEFORE navigation +- **Method:** Uses a `page.on('response')` listener over the CDP connection to capture the Content-Type of the initial response +- **Language:** Converted from Python to JavaScript/Node.js for consistency + +### 3. Navigation State Tracking +- **Added:** `navigation.json` file in `chrome/` output directory +- **Contains:** `waitUntil` condition and `elapsed` milliseconds +- **Purpose:** Track navigation performance and wait conditions for analysis + +### 4. 
Cleanup +- **Deleted:** `chrome_session/on_CrawlEnd__99_chrome_cleanup.py` (manual cleanup hook) +- **Reason:** Automatic cleanup via state machines is sufficient +- **Verified:** Cleanup mechanisms in `core/models.py` and `crawls/models.py` work correctly + +## Hook Execution Order + +``` +═══ CRAWL LEVEL ═══ + 00. chrome_install_config.py Configure Chrome settings + 00. chrome_install.py Install Chrome binary + 20. chrome_launch.bg.js Launch Chrome browser (STAYS RUNNING) + +═══ PER-SNAPSHOT LEVEL ═══ + +Phase 1: PRE-NAVIGATION (Background hooks setup) + 20. chrome_tab.bg.js Open new tab (STAYS ALIVE) + 21. consolelog.bg.js Setup console listener (STAYS ALIVE) + 23. ssl.bg.js Setup SSL listener (STAYS ALIVE) + 24. responses.bg.js Setup network response listener (STAYS ALIVE) + 31. redirects.bg.js Setup redirect listener (STAYS ALIVE) + 31. staticfile.bg.js Setup staticfile detector (STAYS ALIVE) + +Phase 2: NAVIGATION (Foreground - synchronization point) + 30. chrome_navigate.js Navigate to URL (BLOCKS until page loaded) + ↓ + Writes navigation.json with waitUntil & elapsed + Writes page_loaded.txt marker + ↓ + All background hooks can now finalize + +Phase 3: POST-NAVIGATION (Background hooks finalize) + (All .bg hooks save their data and wait for cleanup signal) + +Phase 4: OTHER EXTRACTORS (use loaded page) + 34. screenshot.js + 37. singlefile.js + ... (other extractors that need loaded page) + +Phase 5: CLEANUP + 45. chrome_tab_cleanup.py Close tab + Kill background hooks (SIGTERM → SIGKILL) + Update ArchiveResults +``` + +## Background Hook Pattern + +All `.bg.js` hooks follow this pattern: + +1. **Setup:** Create CDP listeners BEFORE navigation +2. **Capture:** Collect data incrementally as events occur +3. **Write:** Save data to filesystem continuously +4. **Wait:** Keep process alive until SIGTERM +5. **Finalize:** On SIGTERM, emit final JSONL result to stdout +6. 
**Exit:** Clean exit with status code + +**Key files written:** +- `hook.pid` - Process ID for cleanup mechanism +- Output files (e.g., `console.jsonl`, `ssl.jsonl`, etc.) + +## Automatic Cleanup Mechanism + +**Snapshot-level cleanup** (`core/models.py`): +```python +def cleanup(self): + """Kill background hooks and close resources.""" + # Scan OUTPUT_DIR for hook.pid files + # Send SIGTERM to processes + # Wait for graceful exit + # Send SIGKILL if process still alive + # Update ArchiveResults to FAILED if needed +``` + +**Crawl-level cleanup** (`crawls/models.py`): +```python +def cleanup(self): + """Kill Crawl-level background hooks (Chrome browser).""" + # Similar pattern for Crawl-level resources + # Kills Chrome launch process +``` + +**State machine integration:** +- Both `SnapshotMachine` and `CrawlMachine` call `cleanup()` when entering `sealed` state +- Ensures all background processes are cleaned up properly +- No manual cleanup hooks needed + +## Directory References + +**Crawl output structure:** +- Crawls output to: `users/{user_id}/crawls/{YYYYMMDD}/{crawl_id}/` +- Example: `users/1/crawls/20251227/abc-def-123/` +- Crawl-level plugins create subdirectories: `users/1/crawls/20251227/abc-def-123/chrome/` + +**Snapshot output structure:** +- Snapshots output to: `archive/{timestamp}/` +- Snapshot-level plugins create subdirectories: `archive/{timestamp}/chrome/`, `archive/{timestamp}/consolelog/`, etc. 
+ +**Within chrome plugin:** +- Hooks use `.` or `OUTPUT_DIR` to reference the `chrome/` directory they're running in +- Example: `fs.writeFileSync(path.join(OUTPUT_DIR, 'navigation.json'), ...)` + +**From output plugins to chrome (same snapshot):** +- Hooks use `../chrome` to reference Chrome infrastructure in same snapshot +- Example: `const CHROME_SESSION_DIR = '../chrome';` +- Used to read: `cdp_url.txt`, `target_id.txt`, `page_loaded.txt` + +**From snapshot hooks to crawl chrome:** +- Snapshot hooks receive `CRAWL_OUTPUT_DIR` environment variable (set by hooks.py) +- Use: `path.join(process.env.CRAWL_OUTPUT_DIR, 'chrome')` to find crawl-level Chrome +- This allows snapshots to reuse the crawl's shared Chrome browser + +**Navigation synchronization:** +- All hooks wait for `../chrome/page_loaded.txt` before finalizing +- This file is written by `chrome_navigate.js` after navigation completes + +## Design Principles + +1. **One ArchiveResult Per Plugin** + - Each plugin produces exactly ONE output/ArchiveResult + - Infrastructure plugins (like chrome) produce NO ArchiveResult + +2. **Chrome as Infrastructure** + - Provides shared CDP connection, PIDs, navigation state + - No ArchiveResult output of its own + - Single output directory for all infrastructure files + +3. **Background Hooks for CDP** + - Hooks that need CDP listeners BEFORE navigation are background (`.bg.js`) + - They capture events from the initial request/response + - Stay alive through navigation and cleanup + +4. **Foreground for Synchronization** + - `chrome_navigate.js` is foreground (not `.bg`) + - Provides synchronization point - blocks until page loaded + - All other hooks wait for its completion marker + +5. **Automatic Cleanup** + - State machines handle background hook cleanup + - No manual cleanup hooks needed + - SIGTERM for graceful exit, SIGKILL as backup + +6. 
**Clear Separation** + - Infrastructure vs outputs + - One output directory per plugin + - Predictable, maintainable architecture + +## Benefits + +✓ **Architectural Clarity** - Clear separation between infrastructure and outputs +✓ **Correct Output Model** - One ArchiveResult per plugin +✓ **Better Performance** - CDP listeners capture data from initial request +✓ **No Duplication** - Single Chrome infrastructure used by all +✓ **Proper Lifecycle** - Background hooks cleaned up automatically +✓ **Maintainable** - Easy to understand, debug, and extend +✓ **Consistent** - All background hooks follow same pattern +✓ **Observable** - Navigation state tracked for debugging + +## Testing + +Run tests: +```bash +sudo -u testuser bash -c 'source .venv/bin/activate && python -m pytest archivebox/plugins/chrome/tests/ -v' +``` + +## Migration Notes + +**For developers:** +- Chrome infrastructure is now in `chrome/` output dir (not `chrome_session/`) +- Reference `../chrome/cdp_url.txt` from output plugins +- Navigation marker is `../chrome/page_loaded.txt` +- Navigation details in `../chrome/navigation.json` + +**For users:** +- No user-facing changes +- Output structure remains the same +- All extractors continue to work diff --git a/old/TODO_cli_refactor.md b/old/TODO_cli_refactor.md new file mode 100644 index 0000000000..0ce5e09288 --- /dev/null +++ b/old/TODO_cli_refactor.md @@ -0,0 +1,131 @@ +# ArchiveBox CLI Refactor TODO + +## Design Decisions + +1. **Keep `archivebox add`** as high-level convenience command +2. **Unified `archivebox run`** for processing (replaces per-model `run` and `orchestrator`) +3. **Expose all models** including binary, process, machine +4. **Clean break** from old command structure (no backward compatibility aliases) + +## Final Architecture + +``` +archivebox <model> <action> [args...] 
[--filters] +archivebox run [stdin JSONL] +``` + +### Actions (4 per model): +- `create` - Create records (from args, stdin, or JSONL), dedupes by indexed fields +- `list` - Query records (with filters, returns JSONL) +- `update` - Modify records (from stdin JSONL, PATCH semantics) +- `delete` - Remove records (from stdin JSONL, requires --yes) + +### Unified Run Command: +- `archivebox run` - Process queued work + - With stdin JSONL: Process piped records, exit when complete + - Without stdin (TTY): Run orchestrator in foreground until killed + +### Models (7 total): +- `crawl` - Crawl jobs +- `snapshot` - Individual archived pages +- `archiveresult` - Plugin extraction results +- `tag` - Tags/labels +- `binary` - Detected binaries (chrome, wget, etc.) +- `process` - Process execution records (read-only) +- `machine` - Machine/host records (read-only) + +--- + +## Implementation Checklist + +### Phase 1: Unified Run Command +- [x] Create `archivebox/cli/archivebox_run.py` - unified processing command + +### Phase 2: Core Model Commands +- [x] Refactor `archivebox/cli/archivebox_snapshot.py` to Click group with create|list|update|delete +- [x] Refactor `archivebox/cli/archivebox_crawl.py` to Click group with create|list|update|delete +- [x] Create `archivebox/cli/archivebox_archiveresult.py` with create|list|update|delete +- [x] Create `archivebox/cli/archivebox_tag.py` with create|list|update|delete + +### Phase 3: System Model Commands +- [x] Create `archivebox/cli/archivebox_binary.py` with create|list|update|delete +- [x] Create `archivebox/cli/archivebox_process.py` with list only (read-only) +- [x] Create `archivebox/cli/archivebox_machine.py` with list only (read-only) + +### Phase 4: Registry & Cleanup +- [x] Update `archivebox/cli/__init__.py` command registry +- [x] Delete `archivebox/cli/archivebox_extract.py` +- [x] Delete `archivebox/cli/archivebox_remove.py` +- [x] Delete `archivebox/cli/archivebox_search.py` +- [x] Delete 
`archivebox/cli/archivebox_orchestrator.py` +- [x] Update `archivebox/cli/archivebox_add.py` internals (no changes needed - uses models directly) +- [x] Update `archivebox/cli/tests_piping.py` + +### Phase 5: Tests for New Commands +- [ ] Add tests for `archivebox run` command +- [ ] Add tests for `archivebox crawl create|list|update|delete` +- [ ] Add tests for `archivebox snapshot create|list|update|delete` +- [ ] Add tests for `archivebox archiveresult create|list|update|delete` +- [ ] Add tests for `archivebox tag create|list|update|delete` +- [ ] Add tests for `archivebox binary create|list|update|delete` +- [ ] Add tests for `archivebox process list` +- [ ] Add tests for `archivebox machine list` + +--- + +## Usage Examples + +### Basic CRUD +```bash +# Create +archivebox crawl create https://example.com https://foo.com --depth=1 +archivebox snapshot create https://example.com --tag=news + +# List with filters +archivebox crawl list --status=queued +archivebox snapshot list --url__icontains=example.com +archivebox archiveresult list --status=failed --plugin=screenshot + +# Update (reads JSONL from stdin, applies changes) +archivebox snapshot list --tag=old | archivebox snapshot update --tag=new + +# Delete (requires --yes) +archivebox crawl list --url__icontains=example.com | archivebox crawl delete --yes +``` + +### Unified Run Command +```bash +# Run orchestrator in foreground (replaces `archivebox orchestrator`) +archivebox run + +# Process specific records (pipe any JSONL type, exits when done) +archivebox snapshot list --status=queued | archivebox run +archivebox archiveresult list --status=failed | archivebox run +archivebox crawl list --status=queued | archivebox run + +# Mixed types work too - run handles any JSONL +cat mixed_records.jsonl | archivebox run +``` + +### Composed Workflows +```bash +# Full pipeline (replaces old `archivebox add`) +archivebox crawl create https://example.com --status=queued \ + | archivebox snapshot create --status=queued 
\ + | archivebox archiveresult create --status=queued \ + | archivebox run + +# Re-run failed extractions +archivebox archiveresult list --status=failed | archivebox run + +# Delete all snapshots for a domain +archivebox snapshot list --url__icontains=spam.com | archivebox snapshot delete --yes +``` + +### Keep `archivebox add` as convenience +```bash +# This remains the simple user-friendly interface: +archivebox add https://example.com --depth=1 --tag=news + +# Internally equivalent to the composed pipeline above +``` diff --git a/old/TODO_fix_migration_path.md b/old/TODO_fix_migration_path.md new file mode 100644 index 0000000000..4bd25e5eea --- /dev/null +++ b/old/TODO_fix_migration_path.md @@ -0,0 +1,427 @@ +# TODO: Fix Migration Path for v0.7.2/v0.8.6rc0 → v0.9.0 + +## Critical Issue + +The migrations currently **LOSE DATA** during the v0.7.2 → v0.9.0 upgrade: +- `extractor` field data is not being copied to `plugin` field +- `output` field data is not being copied to `output_str` field +- Timestamp fields (`added`, `updated`) may not be properly transformed +- Tag UUID → INTEGER conversion may lose FK relationships + +## Test Database Locations + +Sample databases for testing are available at: +``` +/Users/squash/Local/Code/archiveboxes/archivebox-migration-path/archivebox-v0.7.2/data/index.sqlite3 +/Users/squash/Local/Code/archiveboxes/archivebox-migration-path/archivebox-v0.8.6rc0/data/index.sqlite3 +``` + +Schema comparison reports: +``` +/tmp/schema_comparison_report.md +/tmp/table_presence_matrix.md +``` + +## How to Test Migrations + +### 1. Fresh Install Test +```bash +rm -rf /tmp/test_fresh && mkdir -p /tmp/test_fresh +DATA_DIR=/tmp/test_fresh python -m archivebox init +DATA_DIR=/tmp/test_fresh python -m archivebox status +``` + +### 2. 
v0.7.2 Migration Test +```bash +rm -rf /tmp/test_v072 && mkdir -p /tmp/test_v072 +cp /Users/squash/Local/Code/archiveboxes/archivebox-migration-path/archivebox-v0.7.2/data/index.sqlite3 /tmp/test_v072/ +DATA_DIR=/tmp/test_v072 python -m archivebox init +DATA_DIR=/tmp/test_v072 python -m archivebox status +``` + +### 3. v0.8.6rc0 Migration Test +```bash +rm -rf /tmp/test_v086 && mkdir -p /tmp/test_v086 +cp /Users/squash/Local/Code/archiveboxes/archivebox-migration-path/archivebox-v0.8.6rc0/data/index.sqlite3 /tmp/test_v086/ +DATA_DIR=/tmp/test_v086 python -m archivebox init +DATA_DIR=/tmp/test_v086 python -m archivebox status +``` + +### 4. Verify Data Integrity + +After each test, compare original vs migrated data: + +```bash +# Check ArchiveResult data preservation +echo "=== ORIGINAL ===" +sqlite3 /path/to/original.db "SELECT id, extractor, output, status FROM core_archiveresult LIMIT 5;" + +echo "=== MIGRATED ===" +sqlite3 /tmp/test_vXXX/index.sqlite3 "SELECT id, plugin, output_str, status FROM core_archiveresult LIMIT 5;" + +# Check Snapshot data preservation +echo "=== ORIGINAL SNAPSHOTS ===" +sqlite3 /path/to/original.db "SELECT id, url, title, added, updated FROM core_snapshot LIMIT 5;" + +echo "=== MIGRATED SNAPSHOTS ===" +sqlite3 /tmp/test_vXXX/index.sqlite3 "SELECT id, url, title, bookmarked_at, created_at, modified_at FROM core_snapshot LIMIT 5;" + +# Check Tag data preservation +echo "=== ORIGINAL TAGS ===" +sqlite3 /path/to/original.db "SELECT * FROM core_tag;" + +echo "=== MIGRATED TAGS ===" +sqlite3 /tmp/test_vXXX/index.sqlite3 "SELECT * FROM core_tag;" + +# Check snapshot-tag relationships +sqlite3 /tmp/test_vXXX/index.sqlite3 "SELECT COUNT(*) FROM core_snapshot_tags;" +``` + +**CRITICAL**: Verify: +- Row counts match +- All URLs, titles, timestamps are preserved +- All extractor values are copied to plugin field +- All output values are copied to output_str field +- All tag relationships are maintained (tag IDs should be converted from UUID to 
INTEGER for v0.8.6) + +## Migration Philosophy + +### Principle: Minimal Manual SQL + +Use this approach for complex migrations: + +1. **Python**: Detect existing schema version + ```python + def get_table_columns(table_name): + cursor = connection.cursor() + cursor.execute(f"PRAGMA table_info({table_name})") + return {row[1] for row in cursor.fetchall()} + + cols = get_table_columns('core_archiveresult') + has_extractor = 'extractor' in cols + has_plugin = 'plugin' in cols + ``` + +2. **SQL**: Modify database structure during migration + ```sql + CREATE TABLE core_archiveresult_new (...); + INSERT INTO core_archiveresult_new SELECT ... FROM core_archiveresult; + DROP TABLE core_archiveresult; + ALTER TABLE core_archiveresult_new RENAME TO core_archiveresult; + ``` + +3. **Python**: Copy data between old and new field names + ```python + if 'extractor' in cols and 'plugin' in cols: + cursor.execute("UPDATE core_archiveresult SET plugin = COALESCE(extractor, '')") + ``` + +4. **SQL**: Drop old columns/tables + ```sql + -- Django's RemoveField will handle this + ``` + +5. **Django**: Register the end state so Django knows what the schema should be + ```python + migrations.SeparateDatabaseAndState( + database_operations=[...], # Your SQL/Python migrations + state_operations=[...] # Tell Django what the final schema looks like + ) + ``` + +### Key Files + +- **core/migrations/0023_upgrade_to_0_9_0.py**: Raw SQL migration that upgrades tables from v0.7.2/v0.8.6 schema + - Should create NEW tables with OLD field names (extractor, output, added, updated) + - Should preserve ALL data during table rebuild + - Should NOT add new fields yet (let Django migrations handle that) + +- **core/migrations/0025_alter_archiveresult_options_...py**: Django-generated migration + - Adds new fields (plugin, output_str, bookmarked_at, created_at, etc.) 
+ - Should include RunPython to copy data from old fields to new fields AFTER AddField operations + - RemoveField operations to remove old columns + +- **crawls/migrations/0002_upgrade_from_0_8_6.py**: Handles crawls_crawl table upgrade + - v0.8.6 has `seed_id` + `persona` (VARCHAR) + - v0.9.0 has `urls` + `persona_id` (UUID FK) + +## How to Make vs Apply Migrations + +### Making Migrations (Creating New Migrations) + +**Always run from the archivebox/ subdirectory** (NOT from a data dir): + +```bash +cd archivebox/ +./manage.py makemigrations +./manage.py makemigrations --check # Verify no unreflected changes +``` + +This works because `archivebox/manage.py` has: +```python +os.environ.setdefault('ARCHIVEBOX_DATA_DIR', '.') +``` + +### Applying Migrations (Testing Migrations) + +**Always run from inside a data directory** using `archivebox init`: + +```bash +# WRONG - Don't do this: +cd /some/data/dir +../path/to/archivebox/manage.py migrate + +# RIGHT - Do this: +DATA_DIR=/some/data/dir python -m archivebox init +``` + +Why? Because `archivebox init`: +- Sets up the data directory structure +- Runs migrations with proper DATA_DIR context +- Creates necessary files and folders +- Validates the installation + +## Schema Version Differences + +### v0.7.2 Schema (Migration 0022) +- **ArchiveResult**: `id` (INTEGER), `uuid`, `extractor`, `output`, `cmd`, `pwd`, `cmd_version`, `start_ts`, `end_ts`, `status`, `snapshot_id` +- **Snapshot**: `id`, `url`, `timestamp`, `title`, `added`, `updated`, `crawl_id` +- **Tag**: `id` (INTEGER), `name`, `slug` +- **Crawl**: Doesn't exist in v0.7.2 + +### v0.8.6rc0 Schema +- **ArchiveResult**: `id`, `abid` (not uuid!), `extractor`, `output`, `created_at`, `modified_at`, `retry_at`, `status`, ... +- **Snapshot**: `id`, `url`, `bookmarked_at`, `created_at`, `modified_at`, `crawl_id`, `status`, `retry_at`, ... 
+- **Tag**: `id` (UUID/CHAR!), `name`, `slug`, `abid`, `created_at`, `modified_at`, `created_by_id` +- **Crawl**: `id`, `seed_id`, `persona` (VARCHAR), `max_depth`, `tags_str`, `status`, `retry_at`, ... + +### v0.9.0 Target Schema +- **ArchiveResult**: `id` (INTEGER), `uuid`, `plugin` (not extractor!), `output_str` (not output!), `hook_name`, `created_at`, `modified_at`, `output_files`, `output_json`, `output_size`, `output_mimetypes`, `retry_at`, ... +- **Snapshot**: `id`, `url`, `bookmarked_at` (not added!), `created_at`, `modified_at` (not updated!), `crawl_id`, `parent_snapshot_id`, `status`, `retry_at`, `current_step`, `depth`, `fs_version`, ... +- **Tag**: `id` (INTEGER!), `name`, `slug`, `created_at`, `modified_at`, `created_by_id` +- **Crawl**: `id`, `urls` (not seed_id!), `persona_id` (not persona!), `label`, `notes`, `output_dir`, ... + +## Critical Gotchas and Mistakes to Avoid + +### 1. ❌ DON'T Create New Fields in SQL Migration (0023) + +**WRONG**: +```python +# In core/migrations/0023_upgrade_to_0_9_0.py +cursor.execute(""" + CREATE TABLE core_archiveresult_new ( + id INTEGER PRIMARY KEY, + plugin VARCHAR(32), # ❌ New field! + output_str TEXT, # ❌ New field! + ... + ) +""") +``` + +**RIGHT**: +```python +# In core/migrations/0023_upgrade_to_0_9_0.py - Keep OLD field names! +cursor.execute(""" + CREATE TABLE core_archiveresult_new ( + id INTEGER PRIMARY KEY, + extractor VARCHAR(32), # ✓ OLD field name + output VARCHAR(1024), # ✓ OLD field name + ... + ) +""") +``` + +**Why**: If you create new fields in SQL, Django's AddField operation in migration 0025 will overwrite them with default values, losing your data! + +### 2. ❌ DON'T Copy Data in SQL Migration + +**WRONG**: +```python +# In core/migrations/0023 +cursor.execute(""" + INSERT INTO core_archiveresult_new (plugin, output_str, ...) + SELECT COALESCE(extractor, ''), COALESCE(output, ''), ... 
+ FROM core_archiveresult +""") +``` + +**RIGHT**: Keep old field names in SQL, let Django AddField create new columns, then copy: +```python +# In core/migrations/0025 (AFTER AddField operations) +def copy_old_to_new(apps, schema_editor): + cursor = connection.cursor() + cursor.execute("UPDATE core_archiveresult SET plugin = COALESCE(extractor, '')") + cursor.execute("UPDATE core_archiveresult SET output_str = COALESCE(output, '')") +``` + +### 3. ❌ DON'T Assume Empty Tables Mean Fresh Install + +**WRONG**: +```python +cursor.execute("SELECT COUNT(*) FROM core_archiveresult") +if cursor.fetchone()[0] == 0: + return # Skip migration +``` + +**Why**: Fresh installs run migrations 0001-0022 which CREATE empty tables with old schema. Migration 0023 must still upgrade the schema even if tables are empty! + +**RIGHT**: Detect schema version by checking column names: +```python +cols = get_table_columns('core_archiveresult') +has_extractor = 'extractor' in cols +if has_extractor: + # Old schema - needs upgrade +``` + +### 4. ❌ DON'T Run `makemigrations` from Data Directories + +**WRONG**: +```bash +cd /path/to/data/dir +python manage.py makemigrations +``` + +**RIGHT**: +```bash +cd archivebox/ # The archivebox package directory +./manage.py makemigrations +``` + +### 5. ❌ DON'T Use WHERE Clauses to Skip SQL Selects + +**WRONG**: +```sql +INSERT INTO new_table SELECT uuid FROM old_table +WHERE EXISTS (SELECT 1 FROM pragma_table_info('old_table') WHERE name='uuid'); +``` + +**Why**: SQLite resolves the `uuid` column name when it compiles the statement, before the WHERE clause is ever evaluated, so the query fails with a "no such column" error even when the WHERE condition would be false. + +**RIGHT**: Use Python to detect schema, then run appropriate SQL: +```python +if 'uuid' in get_table_columns('old_table'): + cursor.execute("INSERT INTO new_table SELECT uuid FROM old_table") +else: + cursor.execute("INSERT INTO new_table SELECT abid as uuid FROM old_table") +``` + +### 6. 
❌ DON'T Mix UUID and INTEGER for Tag IDs + +v0.8.6rc0 has Tag.id as UUID, but v0.9.0 needs INTEGER. The conversion must: +1. Create mapping of old UUID → new INTEGER +2. Update core_tag with new IDs +3. Update core_snapshot_tags with new tag_id values + +See `core/migrations/0023_upgrade_to_0_9_0.py` PART 3 for the correct approach. + +### 7. ❌ DON'T Forget SeparateDatabaseAndState + +When you manually change the database with SQL, you MUST tell Django what the final state is: + +```python +migrations.SeparateDatabaseAndState( + database_operations=[ + migrations.RunPython(my_sql_function), + ], + state_operations=[ + migrations.RemoveField('archiveresult', 'extractor'), + migrations.RemoveField('archiveresult', 'output'), + ], +) +``` + +Without `state_operations`, Django won't know the old fields are gone and `makemigrations --check` will show unreflected changes. + +### 8. ✅ DO Print Debug Messages + +```python +print(f'Migrating ArchiveResult from v0.7.2 schema...') +print(f'DEBUG: has_uuid={has_uuid}, has_abid={has_abid}, row_count={row_count}') +``` + +This helps diagnose which migration path is being taken. + +### 9. ✅ DO Test All Three Scenarios + +Always test: +1. Fresh install (empty database) +2. v0.7.2 upgrade (12 snapshots, 44 archiveresults, 2 tags) +3. v0.8.6rc0 upgrade (14 snapshots, 0 archiveresults, multiple tags with UUIDs) + +### 10. ✅ DO Verify No Unreflected Migrations + +After all changes: +```bash +cd archivebox/ +./manage.py makemigrations --check +# Should output: No changes detected +``` + +## Current Status + +As of 2025-01-01, migrations have these issues: + +1. ✅ Fresh install works +2. ✅ v0.7.2 → v0.9.0 migration runs without errors +3. ✅ v0.8.6rc0 → v0.9.0 migration runs without errors +4. ❌ **DATA IS LOST**: `extractor` → `plugin` field data not copied +5. ❌ **DATA IS LOST**: `output` → `output_str` field data not copied +6. ❌ Timestamps (added/updated → bookmarked_at/created_at/modified_at) may have wrong values +7. 
❌ Tag relationships may be broken after UUID → INTEGER conversion + +## Files That Need Fixing + +1. **core/migrations/0023_upgrade_to_0_9_0.py** + - Lines 42-58: CREATE TABLE should use OLD field names (extractor, output, added, updated) + - Lines 64-88: INSERT SELECT should just copy data as-is, no field renaming yet + - Remove all references to plugin, output_str, bookmarked_at, created_at - these are added by 0025 + +2. **core/migrations/0025_...py** + - Add RunPython operation AFTER all AddField operations + - This RunPython should copy: extractor→plugin, output→output_str, added→bookmarked_at/created_at, updated→modified_at + - Fix syntax error on line 28: `{extractor" in cols}` → `{"extractor" in cols}` + +3. **crawls/migrations/0002_upgrade_from_0_8_6.py** + - Already correctly handles conditional upgrade based on schema detection + - No changes needed if crawls table data isn't critical + +## Next Steps + +1. Fix core/migrations/0023 to preserve OLD field names +2. Fix core/migrations/0025 to copy data from old → new fields after AddField +3. Remove debug print statements (lines with `print(f'DEBUG:...`) +4. Test all three scenarios +5. Verify data integrity with SQL queries above +6. 
Run `./manage.py makemigrations --check` to ensure no unreflected changes + +## Reference: Field Mappings + +| Old Field (v0.7.2/v0.8.6) | New Field (v0.9.0) | Notes | +|---------------------------|-------------------|--------| +| `extractor` | `plugin` | Rename | +| `output` | `output_str` | Rename | +| `added` | `bookmarked_at` | Rename + also use for `created_at` | +| `updated` | `modified_at` | Rename | +| `abid` | `uuid` | v0.8.6 only, field rename | +| Tag.id (UUID) | Tag.id (INTEGER) | v0.8.6 only, type conversion | +| `seed_id` | `urls` | Crawl table, v0.8.6 only | +| `persona` (VARCHAR) | `persona_id` (UUID FK) | Crawl table, v0.8.6 only | + +## Testing Checklist + +- [ ] Fresh install creates correct schema +- [ ] Fresh install has 0 snapshots, 0 archiveresults +- [ ] v0.7.2 migration preserves all 12 snapshots +- [ ] v0.7.2 migration preserves all 44 archiveresults +- [ ] v0.7.2 migration preserves all 2 tags +- [ ] v0.7.2 migration copies `extractor` → `plugin` (check first 5 rows) +- [ ] v0.7.2 migration copies `output` → `output_str` (check first 5 rows) +- [ ] v0.7.2 migration copies `added` → `bookmarked_at` (compare timestamps) +- [ ] v0.7.2 migration copies `updated` → `modified_at` (compare timestamps) +- [ ] v0.8.6 migration preserves all 14 snapshots +- [ ] v0.8.6 migration converts Tag IDs from UUID → INTEGER +- [ ] v0.8.6 migration preserves tag relationships in core_snapshot_tags +- [ ] v0.8.6 migration converts `abid` → `uuid` field +- [ ] `./manage.py makemigrations --check` shows no changes +- [ ] All migrations run without errors +- [ ] `archivebox status` shows correct snapshot/link counts diff --git a/old/TODO_fs_migrations.md b/old/TODO_fs_migrations.md new file mode 100644 index 0000000000..ca5b10a47d --- /dev/null +++ b/old/TODO_fs_migrations.md @@ -0,0 +1,1240 @@ +# Lazy Filesystem Migration System - Implementation TODO + +## Architecture Decision: DB as Single Source of Truth + +**Key Principle**: Only `archivebox update` scans 
the filesystem (for migration/import). All other commands query the database exclusively. + +- ✅ `archivebox status` - Query DB only (count by status field) +- ✅ `archivebox search` - Query DB only (filter by URL/tags/etc) +- ✅ `archivebox remove` - Query DB + delete directories +- ⚠️ `archivebox update` - **ONLY command that scans filesystem** (for orphan import + migration) +- ✅ `archivebox init` - Simplified: just apply migrations, no folder scanning + +--- + +## Status: What Already Exists + +### ✅ Core Migration Infrastructure (in `archivebox/core/models.py`) + +**Lines 348-367: Migration on `save()` with transaction wrapper** +- Automatically detects if `fs_migration_needed` +- Walks migration chain: 0.7.0 → 0.8.0 → 0.9.0 +- Calls `_fs_migrate_from_X_to_Y()` methods +- Updates `fs_version` field within transaction + +**Lines 393-419: Migration helper methods** +- `_fs_current_version()` - Gets current ArchiveBox version (normalizes to x.x.0) +- `fs_migration_needed` property - Checks if migration needed +- `_fs_next_version()` - Returns next version in chain +- `_fs_migrate_from_0_7_0_to_0_8_0()` - No-op (same layout) +- `_fs_migrate_from_0_8_0_to_0_9_0()` - **Placeholder (currently no-op at line 427)** ← NEEDS IMPLEMENTATION + +**Lines 540-542: `output_dir` property** +- Currently: `return str(CONSTANTS.ARCHIVE_DIR / self.timestamp)` +- Needs: Check `fs_version`, handle symlinks for backwards compat + +**Line 311: `fs_version` field** +- CharField tracking filesystem version per snapshot +- Default is current ArchiveBox version + +**Lines 266-267: Timestamp uniqueness logic EXISTS** +```python +while self.filter(timestamp=timestamp).exists(): + timestamp = str(float(timestamp) + 1.0) +``` +Already implemented in `create_or_update_from_dict()` at line 241! 
+ +**Lines 120-133: SnapshotQuerySet with `filter_by_patterns()`** +- Already supports filtering by exact/substring/regex/domain/tag/timestamp + +**archivebox/misc/jsonl.py:** +- Line 252: `get_or_create_snapshot()` - Creates snapshot from JSONL record +- Line 281: Uses `Snapshot.objects.create_or_update_from_dict()` internally + +### ✅ Current `archivebox update` Implementation (archivebox/cli/archivebox_update.py) + +**Lines 36-102:** +- Filters snapshots from DB using `filter_by_patterns()` +- Applies before/after timestamp filters +- Queues snapshots via status update +- Starts Orchestrator to process queued snapshots + +**Current behavior:** +- Only queries DB, never scans filesystem ← NEEDS TO BE FIXED +- No orphan detection ← NEEDS TO BE ADDED +- No reconciliation ← NEEDS TO BE ADDED +- No migration triggering ← save() does this automatically + +--- + +## What Needs Implementation + +### Phase 1: Add Methods to Snapshot Model + +File: `archivebox/core/models.py` + +Add these methods after the existing migration methods (around line 457): + +```python +# ========================================================================= +# Path Calculation and Migration Helpers +# ========================================================================= + +@staticmethod +def extract_domain_from_url(url: str) -> str: + """ + Extract domain from URL for 0.9.x path structure. + Uses full hostname with sanitized special chars. 
+ + Examples: + https://example.com:8080 → example.com_8080 + https://sub.example.com → sub.example.com + file:///path → localhost + data:text/html → data + """ + from urllib.parse import urlparse + + try: + parsed = urlparse(url) + + if parsed.scheme in ('http', 'https'): + if parsed.port: + return f"{parsed.hostname}_{parsed.port}".replace(':', '_') + return parsed.hostname or 'unknown' + elif parsed.scheme == 'file': + return 'localhost' + elif parsed.scheme: + return parsed.scheme + else: + return 'unknown' + except Exception: + return 'unknown' + +def get_storage_path_for_version(self, version: str) -> Path: + """ + Calculate storage path for specific filesystem version. + Centralizes path logic so it's reusable. + + 0.7.x/0.8.x: archive/{timestamp} + 0.9.x: users/{username}/snapshots/YYYYMMDD/{domain}/{uuid}/ + """ + from datetime import datetime + + if version in ('0.7.0', '0.8.0'): + return CONSTANTS.ARCHIVE_DIR / self.timestamp + + elif version in ('0.9.0', '1.0.0'): + username = self.created_by.username if self.created_by else 'unknown' + + # Use created_at for date grouping (fallback to timestamp) + if self.created_at: + date_str = self.created_at.strftime('%Y%m%d') + else: + date_str = datetime.fromtimestamp(float(self.timestamp)).strftime('%Y%m%d') + + domain = self.extract_domain_from_url(self.url) + + return ( + CONSTANTS.DATA_DIR / 'users' / username / 'snapshots' / + date_str / domain / str(self.id) + ) + else: + # Unknown version - use current + return self.get_storage_path_for_version(self._fs_current_version()) + +# ========================================================================= +# Loading and Creation from Filesystem (Used by archivebox update ONLY) +# ========================================================================= + +@classmethod +def load_from_directory(cls, snapshot_dir: Path) -> Optional['Snapshot']: + """ + Load existing Snapshot from DB by reading index.json. + + Reads index.json, extracts url+timestamp, queries DB. 
+ Returns existing Snapshot or None if not found/invalid. + Does NOT create new snapshots. + + ONLY used by: archivebox update (for orphan detection) + """ + import json + + index_path = snapshot_dir / 'index.json' + if not index_path.exists(): + return None + + try: + with open(index_path) as f: + data = json.load(f) + except: + return None + + url = data.get('url') + if not url: + return None + + # Get timestamp - prefer index.json, fallback to folder name + timestamp = cls._select_best_timestamp( + index_timestamp=data.get('timestamp'), + folder_name=snapshot_dir.name + ) + + if not timestamp: + return None + + # Look up existing + try: + return cls.objects.get(url=url, timestamp=timestamp) + except cls.DoesNotExist: + return None + except cls.MultipleObjectsReturned: + # Should not happen with unique constraint + return cls.objects.filter(url=url, timestamp=timestamp).first() + +@classmethod +def create_from_directory(cls, snapshot_dir: Path) -> Optional['Snapshot']: + """ + Create new Snapshot from orphaned directory. + + Validates timestamp, ensures uniqueness. + Returns new UNSAVED Snapshot or None if invalid. 
+ + ONLY used by: archivebox update (for orphan import) + """ + import json + from archivebox.base_models.models import get_or_create_system_user_pk + + index_path = snapshot_dir / 'index.json' + if not index_path.exists(): + return None + + try: + with open(index_path) as f: + data = json.load(f) + except: + return None + + url = data.get('url') + if not url: + return None + + # Get and validate timestamp + timestamp = cls._select_best_timestamp( + index_timestamp=data.get('timestamp'), + folder_name=snapshot_dir.name + ) + + if not timestamp: + return None + + # Ensure uniqueness (reuses existing logic from create_or_update_from_dict) + timestamp = cls._ensure_unique_timestamp(url, timestamp) + + # Detect version + fs_version = cls._detect_fs_version_from_index(data) + + return cls( + url=url, + timestamp=timestamp, + title=data.get('title', ''), + fs_version=fs_version, + created_by_id=get_or_create_system_user_pk(), + ) + +@staticmethod +def _select_best_timestamp(index_timestamp: str, folder_name: str) -> Optional[str]: + """ + Select best timestamp from index.json vs folder name. + + Validates range (1995-2035). + Prefers index.json if valid. + """ + def is_valid_timestamp(ts): + try: + ts_int = int(float(ts)) + # 1995-01-01 to 2035-12-31 + return 788918400 <= ts_int <= 2082758400 + except: + return False + + index_valid = is_valid_timestamp(index_timestamp) if index_timestamp else False + folder_valid = is_valid_timestamp(folder_name) + + if index_valid: + return str(int(float(index_timestamp))) + elif folder_valid: + return str(int(float(folder_name))) + else: + return None + +@classmethod +def _ensure_unique_timestamp(cls, url: str, timestamp: str) -> str: + """ + Ensure timestamp is globally unique. + If collision with different URL, increment by 1 until unique. + + NOTE: Logic already exists in create_or_update_from_dict (line 266-267) + This is just an extracted, reusable version. 
+ """ + while cls.objects.filter(timestamp=timestamp).exclude(url=url).exists(): + timestamp = str(int(float(timestamp)) + 1) + return timestamp + +@staticmethod +def _detect_fs_version_from_index(data: dict) -> str: + """ + Detect fs_version from index.json structure. + + - Has fs_version field: use it + - Has history dict: 0.7.0 + - Has archive_results list: 0.8.0 + - Default: 0.7.0 + """ + if 'fs_version' in data: + return data['fs_version'] + if 'history' in data and 'archive_results' not in data: + return '0.7.0' + if 'archive_results' in data: + return '0.8.0' + return '0.7.0' + +# ========================================================================= +# Index.json Reconciliation +# ========================================================================= + +def reconcile_with_index_json(self): + """ + Merge index.json with DB. DB is source of truth. + + - Title: longest non-URL + - Tags: union + - ArchiveResults: keep both (by extractor+start_ts) + + Writes back in 0.9.x format. 
+ + Used by: archivebox update (to sync index.json with DB) + """ + import json + + index_path = Path(self.output_dir) / 'index.json' + + index_data = {} + if index_path.exists(): + try: + with open(index_path) as f: + index_data = json.load(f) + except: + pass + + # Merge title + self._merge_title_from_index(index_data) + + # Merge tags + self._merge_tags_from_index(index_data) + + # Merge ArchiveResults + self._merge_archive_results_from_index(index_data) + + # Write back + self.write_index_json() + +def _merge_title_from_index(self, index_data: dict): + """Merge title - prefer longest non-URL title.""" + index_title = index_data.get('title', '').strip() + db_title = self.title or '' + + candidates = [t for t in [index_title, db_title] if t and t != self.url] + if candidates: + best_title = max(candidates, key=len) + if self.title != best_title: + self.title = best_title + +def _merge_tags_from_index(self, index_data: dict): + """Merge tags - union of both sources.""" + from django.db import transaction + + index_tags = set(index_data.get('tags', '').split(',')) if index_data.get('tags') else set() + index_tags = {t.strip() for t in index_tags if t.strip()} + + db_tags = set(self.tags.values_list('name', flat=True)) + + new_tags = index_tags - db_tags + if new_tags: + with transaction.atomic(): + for tag_name in new_tags: + tag, _ = Tag.objects.get_or_create(name=tag_name) + self.tags.add(tag) + +def _merge_archive_results_from_index(self, index_data: dict): + """Merge ArchiveResults - keep both (by extractor+start_ts).""" + existing = { + (ar.extractor, ar.start_ts): ar + for ar in ArchiveResult.objects.filter(snapshot=self) + } + + # Handle 0.8.x format (archive_results list) + for result_data in index_data.get('archive_results', []): + self._create_archive_result_if_missing(result_data, existing) + + # Handle 0.7.x format (history dict) + if 'history' in index_data and isinstance(index_data['history'], dict): + for extractor, result_list in 
index_data['history'].items(): + if isinstance(result_list, list): + for result_data in result_list: + result_data['extractor'] = extractor + self._create_archive_result_if_missing(result_data, existing) + +def _create_archive_result_if_missing(self, result_data: dict, existing: dict): + """Create ArchiveResult if not already in DB.""" + from dateutil import parser + import json + + extractor = result_data.get('extractor', '') + if not extractor: + return + + start_ts = None + if result_data.get('start_ts'): + try: + start_ts = parser.parse(result_data['start_ts']) + except: + pass + + if (extractor, start_ts) in existing: + return + + try: + end_ts = None + if result_data.get('end_ts'): + try: + end_ts = parser.parse(result_data['end_ts']) + except: + pass + + ArchiveResult.objects.create( + snapshot=self, + extractor=extractor, + status=result_data.get('status', 'failed'), + output_str=result_data.get('output', ''), + cmd=result_data.get('cmd', []), + pwd=result_data.get('pwd', str(self.output_dir)), + start_ts=start_ts, + end_ts=end_ts, + created_by=self.created_by, + ) + except: + pass + +def write_index_json(self): + """Write index.json in 0.9.x format.""" + import json + + index_path = Path(self.output_dir) / 'index.json' + + data = { + 'url': self.url, + 'timestamp': self.timestamp, + 'title': self.title or '', + 'tags': ','.join(sorted(self.tags.values_list('name', flat=True))), + 'fs_version': self.fs_version, + 'bookmarked_at': self.bookmarked_at.isoformat() if self.bookmarked_at else None, + 'created_at': self.created_at.isoformat() if self.created_at else None, + 'archive_results': [ + { + 'extractor': ar.extractor, + 'status': ar.status, + 'start_ts': ar.start_ts.isoformat() if ar.start_ts else None, + 'end_ts': ar.end_ts.isoformat() if ar.end_ts else None, + 'output': ar.output_str or '', + 'cmd': ar.cmd if isinstance(ar.cmd, list) else [], + 'pwd': ar.pwd, + } + for ar in ArchiveResult.objects.filter(snapshot=self).order_by('start_ts') + ], + } + + 
index_path.parent.mkdir(parents=True, exist_ok=True) + with open(index_path, 'w') as f: + json.dump(data, f, indent=2, sort_keys=True) + +# ========================================================================= +# Snapshot Utilities +# ========================================================================= + +@staticmethod +def move_directory_to_invalid(snapshot_dir: Path): + """ + Move invalid directory to data/invalid/YYYYMMDD/. + + Used by: archivebox update (when encountering invalid directories) + """ + from datetime import datetime + import shutil + + invalid_dir = CONSTANTS.DATA_DIR / 'invalid' / datetime.now().strftime('%Y%m%d') + invalid_dir.mkdir(parents=True, exist_ok=True) + + dest = invalid_dir / snapshot_dir.name + counter = 1 + while dest.exists(): + dest = invalid_dir / f"{snapshot_dir.name}_{counter}" + counter += 1 + + try: + shutil.move(str(snapshot_dir), str(dest)) + except: + pass + +@classmethod +def find_and_merge_duplicates(cls) -> int: + """ + Find and merge snapshots with same url:timestamp. + Returns count of duplicate sets merged. + + Used by: archivebox update (Phase 3: deduplication) + """ + from django.db.models import Count + + duplicates = ( + cls.objects + .values('url', 'timestamp') + .annotate(count=Count('id')) + .filter(count__gt=1) + ) + + merged = 0 + for dup in duplicates.iterator(): + snapshots = list( + cls.objects + .filter(url=dup['url'], timestamp=dup['timestamp']) + .order_by('created_at') # Keep oldest + ) + + if len(snapshots) > 1: + try: + cls._merge_snapshots(snapshots) + merged += 1 + except: + pass + + return merged + +@classmethod +def _merge_snapshots(cls, snapshots: list['Snapshot']): + """ + Merge exact duplicates. + Keep oldest, union files + ArchiveResults. 
+ """ + import shutil + + keeper = snapshots[0] + duplicates = snapshots[1:] + + keeper_dir = Path(keeper.output_dir) + + for dup in duplicates: + dup_dir = Path(dup.output_dir) + + # Merge files + if dup_dir.exists() and dup_dir != keeper_dir: + for dup_file in dup_dir.rglob('*'): + if not dup_file.is_file(): + continue + + rel = dup_file.relative_to(dup_dir) + keeper_file = keeper_dir / rel + + if not keeper_file.exists(): + keeper_file.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(dup_file, keeper_file) + + try: + shutil.rmtree(dup_dir) + except: + pass + + # Merge tags + for tag in dup.tags.all(): + keeper.tags.add(tag) + + # Move ArchiveResults + ArchiveResult.objects.filter(snapshot=dup).update(snapshot=keeper) + + # Delete + dup.delete() +``` + +### Phase 2: Update `output_dir` Property + +File: `archivebox/core/models.py` line 540 + +Replace current implementation: + +```python +@cached_property +def output_dir(self): + """The filesystem path to the snapshot's output directory.""" + import os + + current_path = self.get_storage_path_for_version(self.fs_version) + + if current_path.exists(): + return str(current_path) + + # Check for backwards-compat symlink + old_path = CONSTANTS.ARCHIVE_DIR / self.timestamp + if old_path.is_symlink(): + return str(Path(os.readlink(old_path)).resolve()) + elif old_path.exists(): + return str(old_path) + + return str(current_path) +``` + +### Phase 3: Implement Real Migration + +File: `archivebox/core/models.py` line 427 + +Replace the placeholder `_fs_migrate_from_0_8_0_to_0_9_0()`: + +```python +def _fs_migrate_from_0_8_0_to_0_9_0(self): + """ + Migrate from flat to nested structure. + + 0.8.x: archive/{timestamp}/ + 0.9.x: users/{user}/snapshots/YYYYMMDD/{domain}/{uuid}/ + + Transaction handling: + 1. Copy files INSIDE transaction + 2. Create symlink INSIDE transaction + 3. Update fs_version INSIDE transaction (done by save()) + 4. Exit transaction (DB commit) + 5. 
Delete old files OUTSIDE transaction (after commit) + """ + import shutil + from django.db import transaction + + old_dir = self.get_storage_path_for_version('0.8.0') + new_dir = self.get_storage_path_for_version('0.9.0') + + if not old_dir.exists() or old_dir == new_dir or new_dir.exists(): + return + + new_dir.mkdir(parents=True, exist_ok=True) + + # Copy all files (idempotent) + for old_file in old_dir.rglob('*'): + if not old_file.is_file(): + continue + + rel_path = old_file.relative_to(old_dir) + new_file = new_dir / rel_path + + # Skip if already copied + if new_file.exists() and new_file.stat().st_size == old_file.stat().st_size: + continue + + new_file.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(old_file, new_file) + + # Verify all copied + old_files = {f.relative_to(old_dir): f.stat().st_size + for f in old_dir.rglob('*') if f.is_file()} + new_files = {f.relative_to(new_dir): f.stat().st_size + for f in new_dir.rglob('*') if f.is_file()} + + if old_files.keys() != new_files.keys(): + missing = old_files.keys() - new_files.keys() + raise Exception(f"Migration incomplete: missing {missing}") + + # Create backwards-compat symlink (INSIDE transaction) + symlink_path = CONSTANTS.ARCHIVE_DIR / self.timestamp + if symlink_path.is_symlink(): + symlink_path.unlink() + + if not symlink_path.exists() or symlink_path == old_dir: + symlink_path.symlink_to(new_dir, target_is_directory=True) + + # Schedule old directory deletion AFTER transaction commits + transaction.on_commit(lambda: self._cleanup_old_migration_dir(old_dir)) + +def _cleanup_old_migration_dir(self, old_dir: Path): + """ + Delete old directory after successful migration. + Called via transaction.on_commit() after DB commit succeeds. 
+ """ + import shutil + import logging + + if old_dir.exists() and not old_dir.is_symlink(): + try: + shutil.rmtree(old_dir) + except Exception as e: + # Log but don't raise - migration succeeded, this is just cleanup + logging.getLogger('archivebox.migration').warning( + f"Could not remove old migration directory {old_dir}: {e}" + ) +``` + +### Phase 4: Add Timestamp Uniqueness Constraint + +File: `archivebox/core/models.py` - Add to `Snapshot.Meta` class (around line 330): + +```python +class Meta(TypedModelMeta): + verbose_name = "Snapshot" + verbose_name_plural = "Snapshots" + constraints = [ + # Allow same URL in different crawls, but not duplicates within same crawl + models.UniqueConstraint(fields=['url', 'crawl'], name='unique_url_per_crawl'), + # Global timestamp uniqueness for 1:1 symlink mapping + models.UniqueConstraint(fields=['timestamp'], name='unique_timestamp'), + ] +``` + +Then create migration: +```bash +python -m archivebox manage makemigrations core +``` + +### Phase 5: Rewrite `archivebox update` + +File: `archivebox/cli/archivebox_update.py` + +Replace entire file: + +```python +#!/usr/bin/env python3 + +__package__ = 'archivebox.cli' + +import os +import time +import rich_click as click + +from typing import Iterable +from pathlib import Path + +from archivebox.misc.util import enforce_types, docstring + + +@enforce_types +def update(filter_patterns: Iterable[str] = (), + filter_type: str = 'exact', + before: float | None = None, + after: float | None = None, + resume: str | None = None, + batch_size: int = 100, + continuous: bool = False) -> None: + """ + Update snapshots: import orphans, reconcile, and re-run failed extractors. + + Two-phase operation: + - Phase 1: Scan archive/ for orphaned snapshots (skip symlinks) + - Phase 2: Process all DB snapshots (reconcile + re-queue for archiving) + - Phase 3: Deduplicate exact duplicates + + With filters: Only phase 2 (DB query), no filesystem scan. + Without filters: All phases (full update). 
+ """ + + from rich import print + from archivebox.config.django import setup_django + setup_django() + + from archivebox.core.models import Snapshot + from django.utils import timezone + + while True: + if filter_patterns or before or after: + # Filtered mode: query DB only + print('[*] Processing filtered snapshots from database...') + stats = process_filtered_snapshots( + filter_patterns=filter_patterns, + filter_type=filter_type, + before=before, + after=after, + batch_size=batch_size + ) + print_stats(stats) + else: + # Full mode: import orphans + process DB + deduplicate + stats_combined = {'phase1': {}, 'phase2': {}, 'deduplicated': 0} + + print('[*] Phase 1: Scanning archive/ for orphaned snapshots...') + stats_combined['phase1'] = import_orphans_from_archive( + resume_from=resume, + batch_size=batch_size + ) + + print('[*] Phase 2: Processing all database snapshots...') + stats_combined['phase2'] = process_all_db_snapshots(batch_size=batch_size) + + print('[*] Phase 3: Deduplicating...') + stats_combined['deduplicated'] = Snapshot.find_and_merge_duplicates() + + print_combined_stats(stats_combined) + + if not continuous: + break + + print('[yellow]Sleeping 60s before next pass...[/yellow]') + time.sleep(60) + resume = None + + +def import_orphans_from_archive(resume_from: str = None, batch_size: int = 100) -> dict: + """ + Scan archive/ for orphaned snapshots. + Skip symlinks (already migrated). + Create DB records and trigger migration on save(). 
+ """ + from archivebox.core.models import Snapshot + from archivebox.config import CONSTANTS + from django.db import transaction + + stats = {'processed': 0, 'imported': 0, 'migrated': 0, 'invalid': 0} + + archive_dir = CONSTANTS.ARCHIVE_DIR + if not archive_dir.exists(): + return stats + + print('[*] Scanning and sorting by modification time...') + + # Scan and sort by mtime (newest first) + # Loading (mtime, path) tuples is fine even for millions (~100MB for 1M entries) + entries = [ + (e.stat().st_mtime, e.path) + for e in os.scandir(archive_dir) + if e.is_dir(follow_symlinks=False) # Skip symlinks + ] + entries.sort(reverse=True) # Newest first + print(f'[*] Found {len(entries)} directories to check') + + for mtime, entry_path in entries: + entry_path = Path(entry_path) + + # Resume from timestamp if specified + if resume_from and entry_path.name < resume_from: + continue + + stats['processed'] += 1 + + # Check if already in DB + snapshot = Snapshot.load_from_directory(entry_path) + if snapshot: + continue # Already in DB, skip + + # Not in DB - create orphaned snapshot + snapshot = Snapshot.create_from_directory(entry_path) + if not snapshot: + # Invalid directory + Snapshot.move_directory_to_invalid(entry_path) + stats['invalid'] += 1 + print(f" [{stats['processed']}] Invalid: {entry_path.name}") + continue + + needs_migration = snapshot.fs_migration_needed + + snapshot.save() # Creates DB record + triggers migration + + stats['imported'] += 1 + if needs_migration: + stats['migrated'] += 1 + print(f" [{stats['processed']}] Imported + migrated: {entry_path.name}") + else: + print(f" [{stats['processed']}] Imported: {entry_path.name}") + + if stats['processed'] % batch_size == 0: + transaction.commit() + + transaction.commit() + return stats + + +def process_all_db_snapshots(batch_size: int = 100) -> dict: + """ + Process all snapshots in DB. + Reconcile index.json and queue for archiving. 
+ """ + from archivebox.core.models import Snapshot + from django.db import transaction + from django.utils import timezone + + stats = {'processed': 0, 'reconciled': 0, 'queued': 0} + + total = Snapshot.objects.count() + print(f'[*] Processing {total} snapshots from database...') + + for snapshot in Snapshot.objects.iterator(): + # Reconcile index.json with DB + snapshot.reconcile_with_index_json() + + # Queue for archiving (state machine will handle it) + snapshot.status = Snapshot.StatusChoices.QUEUED + snapshot.retry_at = timezone.now() + snapshot.save() + + stats['reconciled'] += 1 + stats['queued'] += 1 + stats['processed'] += 1 + + if stats['processed'] % batch_size == 0: + transaction.commit() + print(f" [{stats['processed']}/{total}] Processed...") + + transaction.commit() + return stats + + +def process_filtered_snapshots( + filter_patterns: Iterable[str], + filter_type: str, + before: float | None, + after: float | None, + batch_size: int +) -> dict: + """Process snapshots matching filters (DB query only).""" + from archivebox.core.models import Snapshot + from django.db import transaction + from django.utils import timezone + from datetime import datetime + + stats = {'processed': 0, 'reconciled': 0, 'queued': 0} + + snapshots = Snapshot.objects.all() + + if filter_patterns: + snapshots = Snapshot.objects.filter_by_patterns(list(filter_patterns), filter_type) + + if before: + snapshots = snapshots.filter(bookmarked_at__lt=datetime.fromtimestamp(before)) + if after: + snapshots = snapshots.filter(bookmarked_at__gt=datetime.fromtimestamp(after)) + + total = snapshots.count() + print(f'[*] Found {total} matching snapshots') + + for snapshot in snapshots.iterator(): + # Reconcile index.json with DB + snapshot.reconcile_with_index_json() + + # Queue for archiving + snapshot.status = Snapshot.StatusChoices.QUEUED + snapshot.retry_at = timezone.now() + snapshot.save() + + stats['reconciled'] += 1 + stats['queued'] += 1 + stats['processed'] += 1 + + if 
stats['processed'] % batch_size == 0: + transaction.commit() + print(f" [{stats['processed']}/{total}] Processed...") + + transaction.commit() + return stats + + +def print_stats(stats: dict): + """Print statistics for filtered mode.""" + from rich import print + + print(f""" +[green]Update Complete[/green] + Processed: {stats['processed']} + Reconciled: {stats['reconciled']} + Queued: {stats['queued']} +""") + + +def print_combined_stats(stats_combined: dict): + """Print statistics for full mode.""" + from rich import print + + s1 = stats_combined['phase1'] + s2 = stats_combined['phase2'] + + print(f""" +[green]Archive Update Complete[/green] + +Phase 1 (Import Orphans): + Checked: {s1.get('processed', 0)} + Imported: {s1.get('imported', 0)} + Migrated: {s1.get('migrated', 0)} + Invalid: {s1.get('invalid', 0)} + +Phase 2 (Process DB): + Processed: {s2.get('processed', 0)} + Reconciled: {s2.get('reconciled', 0)} + Queued: {s2.get('queued', 0)} + +Phase 3 (Deduplicate): + Merged: {stats_combined['deduplicated']} +""") + + +@click.command() +@click.option('--resume', type=str, help='Resume from timestamp') +@click.option('--before', type=float, help='Only snapshots before timestamp') +@click.option('--after', type=float, help='Only snapshots after timestamp') +@click.option('--filter-type', '-t', type=click.Choice(['exact', 'substring', 'regex', 'domain', 'tag', 'timestamp']), default='exact') +@click.option('--batch-size', type=int, default=100, help='Commit every N snapshots') +@click.option('--continuous', is_flag=True, help='Run continuously as background worker') +@click.argument('filter_patterns', nargs=-1) +@docstring(update.__doc__) +def main(**kwargs): + update(**kwargs) + + +if __name__ == '__main__': + main() +``` + +### Phase 6: Simplify `archivebox init` + +File: `archivebox/cli/archivebox_init.py` + +Remove lines 24, 113-150 (folder status function usage): + +```python +# DELETE line 24: +from archivebox.misc.folders import fix_invalid_folder_locations, 
get_invalid_folders + +# DELETE lines 113-150 (folder scanning logic): +# Replace with simple message: +print(' > Run "archivebox update" to import any orphaned snapshot directories') +``` + +Simplified logic: +- Create directory structure +- Apply migrations +- **Don't scan for orphans** (let `archivebox update` handle it) + +### Phase 7: Simplify `archivebox search` + +File: `archivebox/cli/archivebox_search.py` + +Remove lines 65-96 (all folder status imports and `list_folders()` function): + +```python +# DELETE lines 65-96 +# DELETE STATUS_CHOICES with 'valid', 'invalid', 'orphaned', 'corrupted', 'unrecognized' + +# Keep only: 'indexed', 'archived', 'unarchived' +STATUS_CHOICES = ['indexed', 'archived', 'unarchived'] +``` + +Update `search()` function to query DB directly: + +```python +@enforce_types +def search(filter_patterns: list[str] | None=None, + filter_type: str='substring', + status: str='indexed', + before: float | None=None, + after: float | None=None, + sort: str | None=None, + json: bool=False, + html: bool=False, + csv: str | None=None, + with_headers: bool=False): + """List, filter, and export information about archive entries""" + + from archivebox.core.models import Snapshot + + if with_headers and not (json or html or csv): + stderr('[X] --with-headers requires --json, --html or --csv\n', color='red') + raise SystemExit(2) + + # Query DB directly + snapshots = Snapshot.objects.all() + + if filter_patterns: + snapshots = Snapshot.objects.filter_by_patterns(list(filter_patterns), filter_type) + + if status == 'archived': + snapshots = snapshots.filter(downloaded_at__isnull=False) + elif status == 'unarchived': + snapshots = snapshots.filter(downloaded_at__isnull=True) + # 'indexed' = all snapshots (no filter) + + if before: + from datetime import datetime + snapshots = snapshots.filter(bookmarked_at__lt=datetime.fromtimestamp(before)) + if after: + from datetime import datetime + snapshots = 
snapshots.filter(bookmarked_at__gt=datetime.fromtimestamp(after)) + + if sort: + snapshots = snapshots.order_by(sort) + + # Export to requested format + if json: + output = snapshots.to_json(with_headers=with_headers) + elif html: + output = snapshots.to_html(with_headers=with_headers) + elif csv: + output = snapshots.to_csv(cols=csv.split(','), header=with_headers) + else: + from archivebox.misc.logging_util import printable_folders + # Convert to dict for printable_folders + folders = {s.output_dir: s for s in snapshots} + output = printable_folders(folders, with_headers) + + print(output) + return output +``` + +### Phase 8: Delete Folder Status Functions + +File: `archivebox/misc/folders.py` + +Delete lines 23-186 (all status checking functions): + +```python +# DELETE these functions entirely: +# - _is_valid_snapshot() +# - _is_corrupt_snapshot() +# - get_indexed_folders() +# - get_archived_folders() +# - get_unarchived_folders() +# - get_present_folders() +# - get_valid_folders() +# - get_invalid_folders() +# - get_duplicate_folders() +# - get_orphaned_folders() +# - get_corrupted_folders() +# - get_unrecognized_folders() +``` + +Keep only `fix_invalid_folder_locations()` (used by archivebox init for one-time cleanup): + +```python +""" +Folder utilities for ArchiveBox. + +Note: This file only contains legacy cleanup utilities. +The DB is the single source of truth - use Snapshot.objects queries for all status checks. +""" + +__package__ = 'archivebox.misc' + +import os +import json +import shutil +from pathlib import Path +from typing import Tuple, List + +from archivebox.config import DATA_DIR, CONSTANTS +from archivebox.misc.util import enforce_types + + +@enforce_types +def fix_invalid_folder_locations(out_dir: Path = DATA_DIR) -> Tuple[List[str], List[str]]: + """ + Legacy cleanup: Move folders to their correct timestamp-named locations based on index.json. + + This is only used during 'archivebox init' for one-time cleanup of misnamed directories. 
+ After this runs once, 'archivebox update' handles all filesystem operations. + """ + fixed = [] + cant_fix = [] + for entry in os.scandir(out_dir / CONSTANTS.ARCHIVE_DIR_NAME): + if entry.is_dir(follow_symlinks=True): + index_path = Path(entry.path) / 'index.json' + if index_path.exists(): + try: + with open(index_path, 'r') as f: + data = json.load(f) + timestamp = data.get('timestamp') + url = data.get('url') + except Exception: + continue + + if not timestamp: + continue + + if not entry.path.endswith(f'/{timestamp}'): + dest = out_dir / CONSTANTS.ARCHIVE_DIR_NAME / timestamp + if dest.exists(): + cant_fix.append(entry.path) + else: + shutil.move(entry.path, str(dest)) + fixed.append(str(dest)) + return fixed, cant_fix +``` + +--- + +## Testing Plan + +1. **Test migration idempotency:** + ```bash + # Interrupt migration mid-way + # Re-run - should resume seamlessly + ``` + +2. **Test orphan import:** + ```bash + # Create orphaned directory manually + # Run archivebox update + # Verify imported and migrated + ``` + +3. **Test deduplication:** + ```bash + # Create two snapshots with same url:timestamp + # Run archivebox update + # Verify merged + ``` + +4. **Test timestamp uniqueness:** + ```bash + # Try to create snapshots with colliding timestamps + # Verify auto-increment + ``` + +5. **Test filtered update:** + ```bash + archivebox update --after 1234567890 + # Should only process DB, no filesystem scan + ``` + +6. **Test continuous mode:** + ```bash + archivebox update --continuous + # Should run in loop, prioritize newest entries + ``` + +7. 
**Test DB-only commands:** + ```bash + archivebox search --status archived + archivebox search example.com --filter-type substring + archivebox remove example.com + # All should query DB only, no filesystem scanning + ``` + +--- + +## Implementation Checklist + +- [x] Add all new methods to `Snapshot` model (Phase 1) +- [x] Update `output_dir` property (Phase 2) +- [x] Implement real `_fs_migrate_from_0_8_0_to_0_9_0()` (Phase 3) +- [x] Add `_cleanup_old_migration_dir()` helper (Phase 3) +- [x] Add timestamp uniqueness constraint (Phase 4) +- [x] Create database migration for constraint (Phase 4) - Created: `0032_alter_archiveresult_binary_and_more.py` +- [x] Rewrite `archivebox/cli/archivebox_update.py` (Phase 5) +- [x] Simplify `archivebox/cli/archivebox_init.py` (Phase 6) +- [x] Simplify `archivebox/cli/archivebox_search.py` (Phase 7) +- [x] Delete folder status functions from `archivebox/misc/folders.py` (Phase 8) +- [x] Update migration tests (test_migrations_08_to_09.py) +- [x] Update update command tests (tests/test_update.py) +- [ ] Run tests to verify implementation +- [ ] Test migration on real 0.8.x collection +- [ ] Test orphan import in production +- [ ] Test deduplication in production +- [ ] Test filtered vs full mode in production +- [ ] Test continuous mode in production diff --git a/old/TODO_hook_architecture.md b/old/TODO_hook_architecture.md new file mode 100755 index 0000000000..bb6b87cc35 --- /dev/null +++ b/old/TODO_hook_architecture.md @@ -0,0 +1,1976 @@ +# ArchiveBox Hook Architecture + +## Core Design Pattern + +**CRITICAL**: All hooks must follow this unified architecture. This pattern applies to ALL models: Crawl, Dependency, Snapshot, ArchiveResult, etc. + +### The Flow + +``` +1. Model.run() discovers and executes hooks +2. Hooks emit JSONL to stdout +3. Model.run() parses JSONL and creates DB records +4. New DB records trigger their own Model.run() +5. 
Cycle repeats +``` + +**Example Flow:** +``` +Crawl.run() + → runs on_Crawl__* hooks + → hooks emit JSONL: {type: 'Dependency', bin_name: 'wget', ...} + → Crawl.run() creates Dependency record in DB + → Dependency.run() is called automatically + → runs on_Dependency__* hooks + → hooks emit JSONL: {type: 'Binary', name: 'wget', ...} + → Dependency.run() creates Binary record in DB +``` + +### Golden Rules + +1. **Model.run() executes hooks directly** - No helper methods in statemachines. Statemachine just calls Model.run(). + +2. **Hooks emit JSONL** - Any line starting with `{` that has a `type` field creates/updates that model. + ```python + print(json.dumps({'type': 'Dependency', 'bin_name': 'wget', ...})) + print(json.dumps({'type': 'Binary', 'name': 'wget', ...})) + ``` + +3. **JSONL fields = Model fields** - JSONL keys must match Django model field names exactly. No transformation. + ```python + # ✅ CORRECT - matches Dependency model + {'type': 'Dependency', 'bin_name': 'wget', 'bin_providers': 'apt,brew', 'overrides': {...}} + + # ❌ WRONG - uses different field names + {'type': 'Dependency', 'name': 'wget', 'providers': 'apt,brew', 'overrides': {...}} + ``` + +4. **No hardcoding** - Never hardcode binary names, provider names, or anything else. Use discovery. + ```python + # ✅ CORRECT - discovers all on_Dependency hooks dynamically + run_hooks(event_name='Dependency', ...) + + # ❌ WRONG - hardcodes provider list + for provider in ['pip', 'npm', 'apt', 'brew']: + run_hooks(event_name=f'Dependency__install_using_{provider}_provider', ...) + ``` + +5. **Trust abx-pkg** - Never use `shutil.which()`, `subprocess.run([bin, '--version'])`, or manual hash calculation. 
+ ```python + # ✅ CORRECT - abx-pkg handles everything + from abx_pkg import Binary, PipProvider, EnvProvider + binary = Binary(name='wget', binproviders=[PipProvider(), EnvProvider()]).load() + # binary.abspath, binary.version, binary.sha256 are all populated automatically + + # ❌ WRONG - manual detection + abspath = shutil.which('wget') + version = subprocess.run(['wget', '--version'], ...).stdout + ``` + +6. **Hooks check if they can handle requests** - Each hook decides internally if it can handle the dependency. + ```python + # In on_Dependency__install_using_pip_provider.py + if bin_providers != '*' and 'pip' not in bin_providers.split(','): + sys.exit(0) # Can't handle this, exit cleanly + ``` + +7. **Minimal transformation** - Statemachine/Model.run() should do minimal JSONL parsing, just create records. + ```python + # ✅ CORRECT - simple JSONL parsing + obj = json.loads(line) + if obj.get('type') == 'Dependency': + Dependency.objects.create(**obj) + + # ❌ WRONG - complex transformation logic + if obj.get('type') == 'Dependency': + dep = Dependency.objects.create(name=obj['bin_name']) # renaming fields + dep.overrides = transform_overrides(obj['overrides']) # transforming data + ``` + +### Pattern Consistency + +Follow the same pattern as `ArchiveResult.run()` (archivebox/core/models.py:1030): + +```python +def run(self): + """Execute this Model by running hooks and processing JSONL output.""" + + # 1. Discover hooks + hook = discover_hook_for_model(self) + + # 2. Run hook + results = run_hook(hook, output_dir=..., ...) + + # 3. Parse JSONL and update self + for line in results['stdout'].splitlines(): + obj = json.loads(line) + if obj.get('type') == self.__class__.__name__: + self.status = obj.get('status') + self.output = obj.get('output') + # ... apply other fields + + # 4. 
Create side-effect records + for line in results['stdout'].splitlines(): + obj = json.loads(line) + if obj.get('type') != self.__class__.__name__: + create_record_from_jsonl(obj) # Creates Binary, etc. + + self.save() +``` + +### Install Hook Pattern (on_Crawl__00_install_*.py) + +**Purpose**: Check if binary exists, emit Dependency if not found. + +```python +#!/usr/bin/env python3 +import sys +import json + +def find_wget() -> dict | None: + """Find wget binary using abx-pkg.""" + try: + from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider + + binary = Binary(name='wget', binproviders=[AptProvider(), BrewProvider(), EnvProvider()]) + loaded = binary.load() + if loaded and loaded.abspath: + return { + 'name': 'wget', + 'abspath': str(loaded.abspath), + 'version': str(loaded.version) if loaded.version else None, + 'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None, + 'binprovider': loaded.binprovider.name if loaded.binprovider else 'env', + } + except Exception: + pass + + return None + +def main(): + result = find_wget() + + if result and result.get('abspath'): + # Binary found - emit Binary and Machine config + print(json.dumps({ + 'type': 'Binary', + 'name': result['name'], + 'abspath': result['abspath'], + 'version': result['version'], + 'sha256': result['sha256'], + 'binprovider': result['binprovider'], + })) + + print(json.dumps({ + 'type': 'Machine', + '_method': 'update', + 'key': 'config/WGET_BINARY', + 'value': result['abspath'], + })) + + sys.exit(0) + else: + # Binary not found - emit Dependency + print(json.dumps({ + 'type': 'Dependency', + 'bin_name': 'wget', + 'bin_providers': 'apt,brew,env', + 'overrides': {}, # Empty if no special install requirements + })) + print(f"wget binary not found", file=sys.stderr) + sys.exit(1) + +if __name__ == '__main__': + main() +``` + +**Rules:** +- ✅ Use `Binary(...).load()` from abx-pkg - handles finding binary, version, hash automatically +- ✅ Emit `Binary` JSONL if found +- ✅ Emit 
`Dependency` JSONL if not found +- ✅ Use `overrides` field matching abx-pkg format: `{'pip': {'packages': ['pkg']}, 'apt': {'packages': ['pkg']}}` +- ❌ NEVER use `shutil.which()`, `subprocess.run()`, manual version detection, or hash calculation +- ❌ NEVER call package managers (apt, brew, pip, npm) directly + +### Dependency Installation Pattern (on_Dependency__install_*.py) + +**Purpose**: Install binary if not already installed. + +```python +#!/usr/bin/env python3 +import json +import sys +import rich_click as click +from abx_pkg import Binary, PipProvider + +@click.command() +@click.option('--dependency-id', required=True) +@click.option('--bin-name', required=True) +@click.option('--bin-providers', default='*') +@click.option('--overrides', default=None, help="JSON-encoded overrides dict") +def main(dependency_id: str, bin_name: str, bin_providers: str, overrides: str | None): + """Install binary using pip.""" + + # Check if this hook can handle this dependency + if bin_providers != '*' and 'pip' not in bin_providers.split(','): + click.echo(f"pip provider not allowed for {bin_name}", err=True) + sys.exit(0) # Exit cleanly - not an error, just can't handle + + # Parse overrides + overrides_dict = None + if overrides: + try: + full_overrides = json.loads(overrides) + overrides_dict = full_overrides.get('pip', {}) # Extract pip section + except json.JSONDecodeError: + pass + + # Install using abx-pkg + provider = PipProvider() + try: + binary = Binary(name=bin_name, binproviders=[provider], overrides=overrides_dict or {}).install() + except Exception as e: + click.echo(f"pip install failed: {e}", err=True) + sys.exit(1) + + if not binary.abspath: + sys.exit(1) + + # Emit Binary JSONL + print(json.dumps({ + 'type': 'Binary', + 'name': bin_name, + 'abspath': str(binary.abspath), + 'version': str(binary.version) if binary.version else '', + 'sha256': binary.sha256 or '', + 'binprovider': 'pip', + 'dependency_id': dependency_id, + })) + + sys.exit(0) + +if __name__ 
== '__main__': + main() +``` + +**Rules:** +- ✅ Check `bin_providers` parameter - exit cleanly (code 0) if can't handle +- ✅ Parse `overrides` parameter as full dict, extract your provider's section +- ✅ Use `Binary(...).install()` from abx-pkg - handles actual installation +- ✅ Emit `Binary` JSONL on success +- ❌ NEVER hardcode provider names in Model.run() or anywhere else +- ❌ NEVER skip the bin_providers check + +### Model.run() Pattern + +```python +class Dependency(models.Model): + def run(self): + """Execute dependency installation by running all on_Dependency hooks.""" + import json + from pathlib import Path + from django.conf import settings + + # Check if already installed + if self.is_installed: + return self.binaries.first() + + from archivebox.hooks import run_hooks + + # Create output directory + DATA_DIR = getattr(settings, 'DATA_DIR', Path.cwd()) + output_dir = Path(DATA_DIR) / 'tmp' / f'dependency_{self.id}' + output_dir.mkdir(parents=True, exist_ok=True) + + # Build kwargs for hooks + hook_kwargs = { + 'dependency_id': str(self.id), + 'bin_name': self.bin_name, + 'bin_providers': self.bin_providers, + 'overrides': json.dumps(self.overrides) if self.overrides else None, + } + + # Run ALL on_Dependency hooks - each decides if it can handle this + results = run_hooks( + event_name='Dependency', + output_dir=output_dir, + timeout=600, + **hook_kwargs + ) + + # Process results - parse JSONL and create Binary records + for result in results: + if result['returncode'] != 0: + continue + + for line in result['stdout'].strip().split('\n'): + if not line.strip(): + continue + + try: + obj = json.loads(line) + if obj.get('type') == 'Binary': + # Create Binary record - fields match JSONL exactly + if not obj.get('name') or not obj.get('abspath') or not obj.get('version'): + continue + + machine = Machine.current() + binary, _ = Binary.objects.update_or_create( + machine=machine, + name=obj['name'], + defaults={ + 'abspath': obj['abspath'], + 'version': 
obj['version'], + 'sha256': obj.get('sha256') or '', + 'binprovider': obj.get('binprovider') or 'env', + 'dependency': self, + } + ) + + if self.is_installed: + return binary + + except json.JSONDecodeError: + continue + + return None +``` + +**Rules:** +- ✅ Use `run_hooks(event_name='ModelName', ...)` with model name +- ✅ Pass all relevant data as kwargs (will become --cli-args for hooks) +- ✅ Parse JSONL output directly - each line is a potential record +- ✅ Create records using JSONL fields directly - no transformation +- ✅ Let hooks decide if they can handle the request +- ❌ NEVER hardcode hook names or provider lists +- ❌ NEVER create helper methods for hook execution - just call run_hooks() +- ❌ NEVER transform JSONL data - use it as-is + +--- + +# Background Hooks Implementation Plan + +## Overview + +This plan implements support for long-running background hooks that run concurrently with other extractors, while maintaining proper result collection, cleanup, and state management. + +**Key Changes:** +- Background hooks use `.bg.js`/`.bg.py`/`.bg.sh` suffix +- Hooks output **JSONL** (any line with `{type: 'ModelName', ...}`) +- `run_hook()` is **generic** - just parses JSONL, doesn't know about specific models +- Each `Model.run()` extends records of its own type with computed fields +- ArchiveResult.run() extends ArchiveResult records with `output_files`, `output_size`, etc. 
+- **No HookResult TypedDict** - just list of dicts with 'type' field +- Binary FK is optional and only set when hook reports cmd +- Split `output` field into `output_str` (human-readable) and `output_json` (structured) +- Add fields: `output_files` (dict), `output_size` (bytes), `output_mimetypes` (CSV) +- External tools (fdupes, ZFS, Btrfs) handle deduplication via filesystem + +**New ArchiveResult Fields:** +```python +# Output fields (replace old 'output' field) +output_str = TextField() # Human-readable summary: "Downloaded 5 files" +output_json = JSONField() # Structured metadata (headers, redirects, etc.) +output_files = JSONField() # Dict: {'index.html': {}, 'style.css': {}} +output_size = BigIntegerField() # Total bytes across all files +output_mimetypes = CharField() # CSV sorted by size: "text/html,text/css,image/png" +``` + +**output_files Structure:** +- **Dict keyed by relative path** (not a list!) +- Values are empty dicts `{}` for now, extensible for future metadata +- Preserves insertion order (Python 3.7+) +- Easy to query: `ArchiveResult.objects.filter(output_files__has_key='index.html')` +- Easy to extend: Add `size`, `hash`, `mime_type` to values later without migration +- **Why not derive size/mimetypes from output_files?** Performance. Total size and mimetype summary are accessed frequently (admin views, sorting, filtering). Aggregating on every access would be slow. We keep summary fields (output_size, output_mimetypes) as denormalized cache for fast reads. 
+ +--- + +## Phase 1: Database Migration + +### Add new fields to ArchiveResult + +```python +# archivebox/core/migrations/00XX_archiveresult_background_hooks.py + +from django.db import migrations, models + +class Migration(migrations.Migration): + dependencies = [ + ('core', 'XXXX_previous_migration'), + ('machine', 'XXXX_latest_machine_migration'), + ] + + operations = [ + # Add new fields (keep old 'output' temporarily for migration) + migrations.AddField( + model_name='archiveresult', + name='output_str', + field=models.TextField( + blank=True, + help_text='Human-readable output summary (e.g., "Downloaded 5 files")' + ), + ), + + migrations.AddField( + model_name='archiveresult', + name='output_json', + field=models.JSONField( + null=True, + blank=True, + help_text='Structured metadata (headers, redirects, etc.) - should NOT duplicate ArchiveResult fields' + ), + ), + + migrations.AddField( + model_name='archiveresult', + name='output_files', + field=models.JSONField( + default=dict, + help_text='Dict of {relative_path: {metadata}} - values are empty dicts for now, extensible for future metadata' + ), + ), + + migrations.AddField( + model_name='archiveresult', + name='output_size', + field=models.BigIntegerField( + default=0, + help_text='Total recursive size in bytes of all output files' + ), + ), + + migrations.AddField( + model_name='archiveresult', + name='output_mimetypes', + field=models.CharField( + max_length=512, + blank=True, + help_text='CSV of mimetypes sorted by size descending' + ), + ), + + # Add binary FK (optional) + migrations.AddField( + model_name='archiveresult', + name='binary', + field=models.ForeignKey( + 'machine.Binary', + on_delete=models.SET_NULL, + null=True, + blank=True, + help_text='Primary binary used by this hook (optional)' + ), + ), + ] +``` + +### Data Migration for Existing `.output` Field + +```python +# archivebox/core/migrations/00XX_migrate_output_field.py + +from django.db import migrations +import json + +def 
migrate_output_field(apps, schema_editor): + """ + Migrate existing 'output' field to new split fields. + + Logic: + - If output contains JSON {...}, move to output_json + - If output is a file path and exists in output_files, ensure it's first + - Otherwise, move to output_str + """ + ArchiveResult = apps.get_model('core', 'ArchiveResult') + + for ar in ArchiveResult.objects.all(): + old_output = ar.output or '' + + # Case 1: JSON output + if old_output.strip().startswith('{'): + try: + parsed = json.loads(old_output) + ar.output_json = parsed + ar.output_str = '' + except json.JSONDecodeError: + # Not valid JSON, treat as string + ar.output_str = old_output + + # Case 2: File path (check if it looks like a relative path) + elif '/' in old_output or '.' in old_output: + # Might be a file path - if it's in output_files, it's already there + # output_files is now a dict, so no reordering needed + ar.output_str = old_output # Keep as string for display + + # Case 3: Plain string summary + else: + ar.output_str = old_output + + ar.save(update_fields=['output_str', 'output_json', 'output_files']) + +def reverse_migrate(apps, schema_editor): + """Reverse migration - copy output_str back to output.""" + ArchiveResult = apps.get_model('core', 'ArchiveResult') + + for ar in ArchiveResult.objects.all(): + ar.output = ar.output_str or '' + ar.save(update_fields=['output']) + +class Migration(migrations.Migration): + dependencies = [ + ('core', '00XX_archiveresult_background_hooks'), + ] + + operations = [ + migrations.RunPython(migrate_output_field, reverse_migrate), + + # Now safe to remove old 'output' field + migrations.RemoveField( + model_name='archiveresult', + name='output', + ), + ] +``` + + +--- + +## Phase 2: Hook Output Format Specification + +### Hooks emit single JSON object to stdout + +**Contract:** +- Hook scripts must be executable (chmod +x) and specify their interpreter at the top with a /usr/bin/env shebang line +- Hook emits ONE JSON object with `type: 
'ArchiveResult'` +- Hook can provide: `status`, `output_str`, `output_json`, `cmd` (optional) +- Hook should NOT set: `output_files`, `output_size`, `output_mimetypes` (runner calculates these) +- `output_json` should NOT duplicate ArchiveResult fields (no `status`, `start_ts`, etc. in output_json) +- Runner calculates: `output_files`, `output_size`, `output_mimetypes`, `start_ts`, `end_ts`, `binary` FK + +**Example outputs:** + +```javascript +// Simple string output +console.log(JSON.stringify({ + type: 'ArchiveResult', + output_str: 'This is the page title', +})); + +// With structured metadata and optional fields (headers, redirects, etc.) +console.log(JSON.stringify({ + type: 'ArchiveResult', + status: 'succeeded', + output_str: 'Got https://example.com headers', + output_json: {'content-type': 'text/html', 'server': 'nginx', 'status-code': 200, 'content-length': 234235}, +})); + +// With explicit cmd (cmd first arg should match Binary.bin_abspath or XYZ_BINARY env var so ArchiveResult.run() can FK to the Binary) +console.log(JSON.stringify({ + type: 'ArchiveResult', + status: 'succeeded', + output_str: 'Archived with wget', + cmd: ['/some/abspath/to/wget', '-p', '-k', 'https://example.com'] +})); + +// BAD: Don't duplicate ArchiveResult fields in output_json +console.log(JSON.stringify({ + type: 'ArchiveResult', + status: 'succeeded', + output_json: { + status: 'succeeded', // ❌ BAD - this should be up a level on ArchiveResult.status, not inside output_json + title: 'the page title', // ❌ BAD - if the extractor's main output is just a string then it belongs in output_str + custom_data: 1234, // ✅ GOOD - custom fields only + }, + output_files: {'index.html': {}}, // ❌ BAD - runner calculates this for us, no need to return it manually +})); +``` + +--- + +## Phase 3: Architecture - Generic run_hook() + +`run_hook()` is a generic JSONL parser - it doesn't know about ArchiveResult, Binary, or any specific model. It just: +1. Executes the hook script +2. 
Parses JSONL output (any line starting with `{` that has a `type` field) +3. Adds metadata about plugin and hook path +4. Returns list of dicts + +```python +# archivebox/hooks.py + +def run_hook( + script: Path, + output_dir: Path, + timeout: int = 300, + config_objects: Optional[List[Any]] = None, + **kwargs: Any +) -> Optional[List[dict]]: + """ + Execute a hook script and parse JSONL output. + + This function is generic and doesn't know about specific model types. + It just executes the script and parses any JSONL lines with 'type' field. + + Each Model.run() method handles its own record types differently: + - ArchiveResult.run() extends ArchiveResult records with computed fields + - Dependency.run() creates Binary records from hook output + - Crawl.run() can create Dependency records, Snapshots, or Binary records from hook output + + Returns: + List of dicts with 'type' field, each extended with metadata: + [ + { + 'type': 'ArchiveResult', + 'status': 'succeeded', + 'plugin': 'wget', + 'plugin_hook': 'archivebox/plugins/wget/on_Snapshot__21_wget.py', + 'output_str': '...', + # ... other hook-reported fields + }, + { + 'type': 'Binary', + 'name': 'wget', + 'plugin': 'wget', + 'plugin_hook': 'archivebox/plugins/wget/on_Snapshot__21_wget.py', + # ... other hook-reported fields + } + ] + + None if background hook (still running) + """ +``` + +**Key Insight:** Hooks output JSONL. Any line with `{type: 'ModelName', ...}` creates/updates that model. The `type` field determines what gets created. Each Model.run() method decides how to handle records of its own type. + +### Helper: create_model_record() + +```python +# archivebox/hooks.py + +def create_model_record(record: dict) -> Any: + """ + Generic helper to create/update model instances from hook output. 
+ + Args: + record: Dict with 'type' field and model data + + Returns: + Created/updated model instance + """ + from archivebox.machine.models import Binary, Dependency + + model_type = record.pop('type') + + if model_type == 'Binary': + obj, created = Binary.objects.get_or_create(**record) # if model requires custom logic implement Binary.from_jsonl(**record) + return obj + elif model_type == 'Dependency': + obj, created = Dependency.objects.get_or_create(**record) + return obj + # ... Snapshot, ArchiveResult, etc. add more types as needed + else: + raise ValueError(f"Unknown record type: {model_type}") +``` + +--- + +## Phase 4: Plugin Audit & Standardization + +**CRITICAL:** This phase MUST be done FIRST, before updating core code. Do this manually, one plugin at a time. Do NOT batch-update multiple plugins at once. Do NOT skip any plugins or checks. + +**Why First?** Updating plugins to output clean JSONL before changing core code means the transition is safe and incremental. The current run_hook() can continue to work during the plugin updates. + +### 4.1 Install Hook Standardization + +All plugins should follow a consistent pattern for checking and declaring dependencies. + +#### Hook Naming Convention + +**RENAME ALL HOOKS:** +- ❌ OLD: `on_Crawl__*_validate_*.{sh,py,js}` +- ✅ NEW: `on_Crawl__*_install_*.{sh,py,js}` + +Rationale: "install" is clearer than "validate" for what these hooks actually do. + +#### Standard Install Hook Pattern + +**ALL install hooks MUST follow this pattern:** + +1. ✅ Check if Binary already exists for the configured binary +2. ✅ If NOT found, emit a Dependency JSONL record, with overrides if you need to customize install process +3. ❌ NEVER directly call npm, apt, brew, pip, or any package manager +4. ✅ Let bin provider plugins handle actual installation + +**Example Standard Pattern:** + +```python +#!/usr/bin/env python3 +""" +Check for wget binary and emit Dependency if not found. 
+""" +import os +import sys +import json +from pathlib import Path + +def main(): + # 1. Get configured binary name/path from env + binary_path = os.environ.get('WGET_BINARY', 'wget') + + # 2. Check if Binary exists for this binary + # (In practice, this check happens via database query in the actual implementation) + # For install hooks, we emit a Dependency that the system will process + + # 3. Emit Dependency JSONL if needed + # The bin provider will check Binary and install if missing + dependency = { + 'type': 'Dependency', + 'name': 'wget', + 'bin_name': Path(binary_path).name if '/' in binary_path else binary_path, + 'providers': ['apt', 'brew', 'pkg'], # Priority order + 'abspath': binary_path if binary_path.startswith('/') else None, + } + + print(json.dumps(dependency)) + return 0 + +if __name__ == '__main__': + sys.exit(main()) +``` + +#### Config Variable Handling + +**ALL hooks MUST respect user-configured binary paths:** + +- ✅ Read `XYZ_BINARY` env var (e.g., `WGET_BINARY`, `YTDLP_BINARY`, `CHROME_BINARY`) +- ✅ Support absolute paths: `WGET_BINARY=/usr/local/bin/wget2` +- ✅ Support bin names: `WGET_BINARY=wget2` +- ✅ Check for the CORRECT binary name in Binary +- ✅ If user provides `WGET_BINARY=wget2`, check for `wget2` not `wget` + +**Example Config Handling:** + +```python +# Get configured binary (could be path or name) +binary_path = os.environ.get('WGET_BINARY', 'wget') + +# Extract just the binary name for Binary lookup +if '/' in binary_path: + # Absolute path: /usr/local/bin/wget2 -> wget2 + bin_name = Path(binary_path).name +else: + # Just a name: wget2 -> wget2 + bin_name = binary_path + +# Now check Binary for bin_name (not hardcoded 'wget') +``` + +### 4.2 Snapshot Hook Standardization + +All `on_Snapshot__*.*` hooks must follow the output format specified in **Phase 2**. Key points for implementation: + +#### Output Format Requirements + +**CRITICAL Legacy Issues to Fix:** + +1. 
❌ **Remove `RESULT_JSON=` prefix** - old hooks use `console.log('RESULT_JSON=' + ...)` +2. ❌ **Remove extra output lines** - old hooks print VERSION=, START_TS=, END_TS=, STATUS=, OUTPUT= +3. ❌ **Remove `--version` calls** - hooks should NOT run binary version checks +4. ✅ **Output clean JSONL only** - exactly ONE line: `console.log(JSON.stringify(result))` + +**Before (WRONG):** +```javascript +console.log(`VERSION=${version}`); +console.log(`START_TS=${startTime.toISOString()}`); +console.log(`RESULT_JSON=${JSON.stringify(result)}`); +``` + +**After (CORRECT):** +```javascript +console.log(JSON.stringify({type: 'ArchiveResult', status: 'succeeded', output_str: 'Done'})); +``` + +> **See Phase 2 for complete JSONL format specification and examples.** + +#### Using Configured Binaries + +**ALL on_Snapshot hooks MUST:** + +1. ✅ Read the correct `XYZ_BINARY` env var +2. ✅ Use that binary path/name in their commands +3. ✅ Pass cmd in JSONL output for binary FK lookup + +**Example:** + +```javascript +// ✅ CORRECT - uses env var +const wgetBinary = process.env.WGET_BINARY || 'wget'; +const cmd = [wgetBinary, '-p', '-k', url]; + +// Execute command... 
+const result = execFileSync(cmd[0], cmd.slice(1)); // no shell involved, so the URL is passed as a literal argument + +// Report cmd in output for binary FK +console.log(JSON.stringify({ + type: 'ArchiveResult', + status: 'succeeded', + output_str: 'Downloaded page', + cmd: cmd, // ✅ Includes configured binary +})); +``` + +```javascript +// ❌ WRONG - hardcoded binary name +const cmd = ['wget', '-p', '-k', url]; // Ignores WGET_BINARY +``` + +### 4.3 Per-Plugin Checklist + +**For EACH plugin, verify ALL of these:** + +#### Install Hook Checklist + +- [x] Renamed from `on_Crawl__*_validate_*` to `on_Crawl__*_install_*` +- [x] Reads `XYZ_BINARY` env var and handles both absolute paths + bin names +- [x] Emits `{"type": "Dependency", ...}` JSONL (uses configured bin_name) +- [x] Does NOT call npm/apt/brew/pip directly +- [x] Follows standard pattern from section 4.1 + +#### Snapshot Hook Checklist + +- [x] Reads correct `XYZ_BINARY` env var and uses it in cmd +- [x] Outputs EXACTLY ONE JSONL line (NO `RESULT_JSON=` prefix) +- [x] NO extra output lines (VERSION=, START_TS=, END_TS=, STATUS=, OUTPUT=) +- [~] Does NOT run `--version` commands (some hooks still do for compatibility checks) +- [x] Only provides allowed fields (type, status, output_str, output_json, cmd) +- [x] Does NOT include computed fields (see Phase 2 for forbidden fields list) +- [x] Includes `cmd` array with configured binary path (Python hooks) + +### 4.4 Implementation Process + +**MANDATORY PROCESS:** + +1. ✅ List ALL plugins in archivebox/plugins/ +2. ✅ For EACH plugin (DO NOT BATCH): + a. Read ALL hook files in the plugin directory + b. Check install hooks against checklist 4.3 + c. Check snapshot hooks against checklist 4.3 + d. Fix issues one by one + e. Test the plugin hooks + f. Move to next plugin +3. ❌ DO NOT skip any plugins +4. ❌ DO NOT batch-update multiple plugins +5. 
❌ DO NOT assume plugins are similar enough to update together + +**Why one-by-one?** +- Each plugin may have unique patterns +- Each plugin may use different languages (sh/py/js) +- Each plugin may have different edge cases +- Batch updates lead to copy-paste errors + +### 4.5 Testing Each Plugin + +After updating each plugin, verify: + +1. ✅ Install hook can be executed: `python3 on_Crawl__01_install_wget.py` +2. ✅ Install hook outputs valid JSONL: `python3 ... | jq .` +3. ✅ Install hook respects `XYZ_BINARY` env var +4. ✅ Snapshot hook can be executed with test URL +5. ✅ Snapshot hook outputs EXACTLY ONE JSONL line +6. ✅ Snapshot hook JSONL parses correctly: `... | jq .type` +7. ✅ Snapshot hook uses configured binary from env + +### 4.6 Common Pitfalls + +When auditing plugins, watch for these common mistakes: + +1. **Hardcoded binary names** - Check `Binary.objects.filter(name='wget')` → should use configured name +2. **Old output format** - Look for `RESULT_JSON=`, `VERSION=`, `START_TS=` lines +3. **Computed fields in output** - Watch for `output_files`, `start_ts`, `duration` in JSONL +4. **Missing config variables** - Ensure hooks read `XYZ_BINARY` env vars +5. **Version checks** - Remove any `--version` command executions + +> See sections 4.1 and 4.2 for detailed before/after examples. + +--- + +## Phase 5: Update run_hook() Implementation + +**Note:** Only do this AFTER Phase 4 (plugin standardization) is complete. By then, all plugins will output clean JSONL and this implementation will work smoothly. + +### Location: `archivebox/hooks.py` + +```python +def find_binary_for_cmd(cmd: List[str], machine_id: str) -> Optional[str]: + """ + Find Binary for a command, trying abspath first then name. + Only matches binaries on the current machine. 
+ + Args: + cmd: Command list (e.g., ['/usr/bin/wget', '-p', 'url']) + machine_id: Current machine ID + + Returns: + Binary ID if found, None otherwise + """ + if not cmd: + return None + + from archivebox.machine.models import Binary + + bin_path_or_name = cmd[0] + + # Try matching by absolute path first + binary = Binary.objects.filter( + abspath=bin_path_or_name, + machine_id=machine_id + ).first() + + if binary: + return str(binary.id) + + # Fallback: match by binary name + bin_name = Path(bin_path_or_name).name + binary = Binary.objects.filter( + name=bin_name, + machine_id=machine_id + ).first() + + return str(binary.id) if binary else None + + +def run_hook( + script: Path, + output_dir: Path, + timeout: int = 300, + config_objects: Optional[List[Any]] = None, + **kwargs: Any +) -> Optional[List[dict]]: + """ + Execute a hook script and parse JSONL output. + + This is a GENERIC function that doesn't know about specific model types. + It just executes and parses JSONL (any line with {type: 'ModelName', ...}). + + Runner responsibilities: + - Detect background hooks (.bg. in filename) + - Capture stdout/stderr to log files + - Parse JSONL output and add plugin metadata + - Clean up log files and PID files + + Hook responsibilities: + - Emit JSONL: {type: 'ArchiveResult', status, output_str, output_json, cmd} + - Can emit multiple types: {type: 'Binary', ...} + - Write actual output files + + Args: + script: Path to hook script + output_dir: Working directory (where output files go) + timeout: Max execution time in seconds + config_objects: Config override objects (Machine, Crawl, Snapshot) + **kwargs: CLI arguments passed to script + + Returns: + List of dicts with 'type' field for foreground hooks + None for background hooks (still running) + """ + import time + from datetime import datetime, timezone + from archivebox.machine.models import Machine + + start_time = time.time() + + # 1. SETUP + is_background = '.bg.' 
in script.name # Detect .bg.js/.bg.py/.bg.sh + effective_timeout = timeout * 10 if is_background else timeout + + # Infrastructure files (ALL hooks) + stdout_file = output_dir / 'stdout.log' + stderr_file = output_dir / 'stderr.log' + pid_file = output_dir / 'hook.pid' + + # Capture files before execution + files_before = set(output_dir.rglob('*')) if output_dir.exists() else set() + start_ts = datetime.now(timezone.utc) + + # 2. BUILD COMMAND + ext = script.suffix.lower() + if ext == '.sh': + interpreter_cmd = ['bash', str(script)] + elif ext == '.py': + interpreter_cmd = ['python3', str(script)] + elif ext == '.js': + interpreter_cmd = ['node', str(script)] + else: + interpreter_cmd = [str(script)] + + # Build CLI arguments from kwargs + cli_args = [] + for key, value in kwargs.items(): + if key.startswith('_'): + continue + + arg_key = f'--{key.replace("_", "-")}' + if isinstance(value, bool): + if value: + cli_args.append(arg_key) + elif value is not None and value != '': + if isinstance(value, (dict, list)): + cli_args.append(f'{arg_key}={json.dumps(value)}') + else: + str_value = str(value).strip() + if str_value: + cli_args.append(f'{arg_key}={str_value}') + + full_cmd = interpreter_cmd + cli_args + + # 3. SET UP ENVIRONMENT + env = os.environ.copy() + # ... (existing env setup from current run_hook implementation) + + # 4. CREATE OUTPUT DIRECTORY + output_dir.mkdir(parents=True, exist_ok=True) + + # 5. 
EXECUTE PROCESS + try: + with open(stdout_file, 'w') as out, open(stderr_file, 'w') as err: + process = subprocess.Popen( + full_cmd, + cwd=str(output_dir), + stdout=out, + stderr=err, + env=env, + ) + + # Write PID for all hooks + pid_file.write_text(str(process.pid)) + + if is_background: + # Background hook - return immediately, don't wait + return None + + # Foreground hook - wait for completion + try: + returncode = process.wait(timeout=effective_timeout) + except subprocess.TimeoutExpired: + process.kill() + process.wait() + returncode = -1 + with open(stderr_file, 'a') as err: + err.write(f'\nHook timed out after {effective_timeout}s') + + # 6. COLLECT RESULTS (foreground only) + end_ts = datetime.now(timezone.utc) + + stdout = stdout_file.read_text() if stdout_file.exists() else '' + stderr = stderr_file.read_text() if stderr_file.exists() else '' + + # Parse ALL JSONL output (any line with {type: 'ModelName', ...}) + records = [] + for line in stdout.splitlines(): + line = line.strip() + if not line or not line.startswith('{'): + continue + try: + data = json.loads(line) + if 'type' in data: + # Add plugin metadata to every record + plugin_name = script.parent.name # Directory name (e.g., 'wget') + data['plugin'] = plugin_name + data['plugin_hook'] = str(script.relative_to(Path.cwd())) + records.append(data) + except json.JSONDecodeError: + continue + + # 7. CLEANUP + # Delete empty logs (keep non-empty for debugging) + if stdout_file.exists() and stdout_file.stat().st_size == 0: + stdout_file.unlink() + if stderr_file.exists() and stderr_file.stat().st_size == 0: + stderr_file.unlink() + + # Delete ALL .pid files on success + if returncode == 0: + for pf in output_dir.glob('*.pid'): + pf.unlink(missing_ok=True) + + # 8. 
RETURN RECORDS + # Returns list of dicts, each with 'type' field and plugin metadata + return records + + except Exception as e: + # On error, return empty list (hook failed, no records created) + return [] +``` + +--- + +## Phase 6: Update ArchiveResult.run() + +**Note:** Only do this AFTER Phase 5 (run_hook() implementation) is complete. + +### Location: `archivebox/core/models.py` + +```python +def run(self): + """ + Execute this ArchiveResult's extractor and update status. + + For foreground hooks: Waits for completion and updates immediately + For background hooks: Returns immediately, leaves status='started' + + This method extends any ArchiveResult records from hook output with + computed fields (output_files, output_size, binary FK, etc.). + """ + from django.utils import timezone + from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook, find_binary_for_cmd, create_model_record + from archivebox.machine.models import Machine + + config_objects = [self.snapshot.crawl, self.snapshot] if self.snapshot.crawl else [self.snapshot] + + # Find hook for this extractor + hook = None + for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR): + if not base_dir.exists(): + continue + matches = list(base_dir.glob(f'*/on_Snapshot__{self.extractor}.*')) + if matches: + hook = matches[0] + break + + if not hook: + self.status = self.StatusChoices.FAILED + self.output_str = f'No hook found for: {self.extractor}' + self.retry_at = None + self.save() + return + + # Use plugin directory name instead of extractor name + plugin_name = hook.parent.name + extractor_dir = Path(self.snapshot.output_dir) / plugin_name + + start_ts = timezone.now() + + # Run the hook (returns list of JSONL records) + records = run_hook( + hook, + output_dir=extractor_dir, + config_objects=config_objects, + url=self.snapshot.url, + snapshot_id=str(self.snapshot.id), + ) + + # BACKGROUND HOOK - still running + if records is None: + self.status = self.StatusChoices.STARTED + 
self.start_ts = start_ts + self.pwd = str(extractor_dir) + self.save() + return + + # FOREGROUND HOOK - process records + end_ts = timezone.now() + + # Find the ArchiveResult record (enforce single output) + ar_records = [r for r in records if r.get('type') == 'ArchiveResult'] + assert len(ar_records) <= 1, f"Hook {hook} output {len(ar_records)} ArchiveResults, expected 0-1" + + if ar_records: + hook_data = ar_records[0] + + # Apply hook's data + status_str = hook_data.get('status', 'failed') + status_map = { + 'succeeded': self.StatusChoices.SUCCEEDED, + 'failed': self.StatusChoices.FAILED, + 'skipped': self.StatusChoices.SKIPPED, + } + self.status = status_map.get(status_str, self.StatusChoices.FAILED) + + self.output_str = hook_data.get('output_str', '') + self.output_json = hook_data.get('output_json') + + # Set extractor from plugin metadata + self.extractor = hook_data['plugin'] + + # Determine binary FK from cmd (ArchiveResult-specific logic) + if 'cmd' in hook_data: + self.cmd = json.dumps(hook_data['cmd']) + machine = Machine.current() + binary_id = find_binary_for_cmd(hook_data['cmd'], machine.id) + if binary_id: + self.binary_id = binary_id + else: + # No ArchiveResult output - hook didn't report, treat as failed + self.status = self.StatusChoices.FAILED + self.output_str = 'Hook did not output ArchiveResult' + + # Set timestamps and metadata + self.start_ts = start_ts + self.end_ts = end_ts + self.pwd = str(extractor_dir) + self.retry_at = None + + # POPULATE OUTPUT FIELDS FROM FILESYSTEM (ArchiveResult-specific) + if extractor_dir.exists(): + self._populate_output_fields(extractor_dir) + + self.save() + + # Create any side-effect records (Binary, Dependency, etc.) 
+ for record in records: + if record['type'] != 'ArchiveResult': + create_model_record(record) # Generic helper that dispatches by type + + # Clean up empty output directory (no real files after excluding logs/pids) + if extractor_dir.exists(): + try: + # Check if only infrastructure files remain + remaining_files = [ + f for f in extractor_dir.rglob('*') + if f.is_file() and f.name not in ('stdout.log', 'stderr.log', 'hook.pid', 'listener.pid') + ] + if not remaining_files: + # Remove infrastructure files + for pf in extractor_dir.glob('*.log'): + pf.unlink(missing_ok=True) + for pf in extractor_dir.glob('*.pid'): + pf.unlink(missing_ok=True) + # Try to remove directory if empty + if not any(extractor_dir.iterdir()): + extractor_dir.rmdir() + except (OSError, RuntimeError): + pass + + # Queue discovered URLs, trigger indexing, etc. + self._queue_urls_for_crawl(extractor_dir) + + if self.status == self.StatusChoices.SUCCEEDED: + # Update snapshot title if this is title extractor + extractor_name = get_extractor_name(self.extractor) + if extractor_name == 'title': + self._update_snapshot_title(extractor_dir) + + # Trigger search indexing + self.trigger_search_indexing() + + +def _populate_output_fields(self, output_dir: Path) -> None: + """ + Walk output directory and populate output_files, output_size, output_mimetypes fields. 
+ + Args: + output_dir: Directory containing output files + """ + import mimetypes + from collections import defaultdict + + exclude_names = {'stdout.log', 'stderr.log', 'hook.pid', 'listener.pid'} + + # Track mimetypes and sizes for aggregation + mime_sizes = defaultdict(int) + total_size = 0 + output_files = {} # Dict keyed by relative path + + for file_path in output_dir.rglob('*'): + # Skip non-files and infrastructure files + if not file_path.is_file(): + continue + if file_path.name in exclude_names: + continue + + # Get file stats + stat = file_path.stat() + mime_type, _ = mimetypes.guess_type(str(file_path)) + mime_type = mime_type or 'application/octet-stream' + + # Track for ArchiveResult fields + relative_path = str(file_path.relative_to(output_dir)) + output_files[relative_path] = {} # Empty dict, extensible for future metadata + mime_sizes[mime_type] += stat.st_size + total_size += stat.st_size + + # Populate ArchiveResult fields + self.output_files = output_files # Dict preserves insertion order (Python 3.7+) + self.output_size = total_size + + # Build output_mimetypes CSV (sorted by size descending) + sorted_mimes = sorted(mime_sizes.items(), key=lambda x: x[1], reverse=True) + self.output_mimetypes = ','.join(mime for mime, _ in sorted_mimes) +``` + +### Querying output_files with Django + +Since `output_files` is a dict keyed by relative path, you can use Django's JSON field lookups: + +```python +# Check if a specific file exists +ArchiveResult.objects.filter(output_files__has_key='index.html') + +# Check if any of multiple files exist (OR) +from django.db.models import Q +ArchiveResult.objects.filter( + Q(output_files__has_key='index.html') | + Q(output_files__has_key='index.htm') +) + +# Get all results that have favicon +ArchiveResult.objects.filter(output_files__has_key='favicon.ico') + +# Check in Python (after fetching) +if 'index.html' in archiveresult.output_files: + print("Found index.html") + +# Get list of all paths +paths = 
list(archiveresult.output_files.keys()) + +# Count files +file_count = len(archiveresult.output_files) + +# Future: When we add metadata, query still works +# output_files = {'index.html': {'size': 4096, 'hash': 'abc...'}} +ArchiveResult.objects.filter(**{'output_files__index.html__size__gt': 1000}) # size > 1000 bytes (the key contains a dot, so the lookup must be passed as a dict key) +``` + +**Structure for Future Extension:** + +Current (empty metadata): +```python +{ + 'index.html': {}, + 'style.css': {}, + 'images/logo.png': {} +} +``` + +Future (with optional metadata): +```python +{ + 'index.html': { + 'size': 4096, + 'hash': 'abc123...', + 'mime_type': 'text/html' + }, + 'style.css': { + 'size': 2048, + 'hash': 'def456...', + 'mime_type': 'text/css' + } +} +``` + +All existing queries continue to work unchanged - the dict structure is backward compatible. + +--- + +## Phase 7: Background Hook Support + +This phase adds support for long-running background hooks that don't block other extractors. + +### 7.1 Background Hook Detection + +Background hooks are identified by `.bg.` suffix in filename: +- `on_Snapshot__21_consolelog.bg.js` ← background +- `on_Snapshot__11_favicon.js` ← foreground + +### 7.2 Rename Background Hooks + +**Files to rename:** + +```bash +# Use .bg. suffix (not __background) +mv archivebox/plugins/consolelog/on_Snapshot__21_consolelog.js \ + archivebox/plugins/consolelog/on_Snapshot__21_consolelog.bg.js + +mv archivebox/plugins/ssl/on_Snapshot__23_ssl.js \ + archivebox/plugins/ssl/on_Snapshot__23_ssl.bg.js + +mv archivebox/plugins/responses/on_Snapshot__24_responses.js \ + archivebox/plugins/responses/on_Snapshot__24_responses.bg.js +``` + +**Update hook content to emit proper JSON:** + +Each hook should emit: +```javascript +console.log(JSON.stringify({ + type: 'ArchiveResult', + status: 'succeeded', // or 'failed' or 'skipped' + output_str: 'Captured 15 console messages', // human-readable summary + output_json: { // optional structured metadata + // ... 
specific to each hook + } +})); +``` + +### 7.3 Finalization Helper Functions + +Location: `archivebox/core/models.py` or new `archivebox/core/background_hooks.py` + +```python +def find_background_hooks(snapshot) -> List['ArchiveResult']: + """ + Find all ArchiveResults that are background hooks still running. + + Args: + snapshot: Snapshot instance + + Returns: + List of ArchiveResults with status='started' + """ + return list(snapshot.archiveresult_set.filter( + status=ArchiveResult.StatusChoices.STARTED + )) + + +def check_background_hook_completed(archiveresult: 'ArchiveResult') -> bool: + """ + Check if background hook process has exited. + + Args: + archiveresult: ArchiveResult instance + + Returns: + True if completed (process exited), False if still running + """ + extractor_dir = Path(archiveresult.pwd) + pid_file = extractor_dir / 'hook.pid' + + if not pid_file.exists(): + return True # No PID file = completed or failed to start + + try: + pid = int(pid_file.read_text().strip()) + os.kill(pid, 0) # Signal 0 = check if process exists + return False # Still running + except (OSError, ValueError): + return True # Process exited or invalid PID + + +def finalize_background_hook(archiveresult: 'ArchiveResult') -> None: + """ + Collect final results from completed background hook. + + Same logic as ArchiveResult.run() but for background hooks that already started. 
+ + Args: + archiveresult: ArchiveResult instance to finalize + """ + from django.utils import timezone + from archivebox.machine.models import Machine + + extractor_dir = Path(archiveresult.pwd) + stdout_file = extractor_dir / 'stdout.log' + stderr_file = extractor_dir / 'stderr.log' + + # Read logs + stdout = stdout_file.read_text() if stdout_file.exists() else '' + + # Parse JSONL output (same as run_hook) + records = [] + for line in stdout.splitlines(): + line = line.strip() + if not line or not line.startswith('{'): + continue + try: + data = json.loads(line) + if 'type' in data: + records.append(data) + except json.JSONDecodeError: + continue + + # Find the ArchiveResult record + ar_records = [r for r in records if r.get('type') == 'ArchiveResult'] + assert len(ar_records) <= 1, f"Background hook output {len(ar_records)} ArchiveResults, expected 0-1" + + if ar_records: + hook_data = ar_records[0] + + # Apply hook's data + status_str = hook_data.get('status', 'failed') + status_map = { + 'succeeded': ArchiveResult.StatusChoices.SUCCEEDED, + 'failed': ArchiveResult.StatusChoices.FAILED, + 'skipped': ArchiveResult.StatusChoices.SKIPPED, + } + archiveresult.status = status_map.get(status_str, ArchiveResult.StatusChoices.FAILED) + + archiveresult.output_str = hook_data.get('output_str', '') + archiveresult.output_json = hook_data.get('output_json') + + # Determine binary FK from cmd + if 'cmd' in hook_data: + archiveresult.cmd = json.dumps(hook_data['cmd']) + machine = Machine.current() + binary_id = find_binary_for_cmd(hook_data['cmd'], machine.id) + if binary_id: + archiveresult.binary_id = binary_id + else: + # No output = failed + archiveresult.status = ArchiveResult.StatusChoices.FAILED + archiveresult.output_str = 'Background hook did not output ArchiveResult' + + archiveresult.end_ts = timezone.now() + archiveresult.retry_at = None + + # POPULATE OUTPUT FIELDS FROM FILESYSTEM + if extractor_dir.exists(): + 
archiveresult._populate_output_fields(extractor_dir) + + archiveresult.save() + + # Create any side-effect records + for record in records: + if record['type'] != 'ArchiveResult': + create_model_record(record) + + # Cleanup + for pf in extractor_dir.glob('*.pid'): + pf.unlink(missing_ok=True) + if stdout_file.exists() and stdout_file.stat().st_size == 0: + stdout_file.unlink() + if stderr_file.exists() and stderr_file.stat().st_size == 0: + stderr_file.unlink() +``` + +### 7.4 Update SnapshotMachine + +Location: `archivebox/core/statemachines.py` + +```python +class SnapshotMachine(StateMachine, strict_states=True): + # ... existing states ... + + def is_finished(self) -> bool: + """ + Check if snapshot archiving is complete. + + A snapshot is finished when: + 1. No pending archiveresults remain (queued/started foreground hooks) + 2. All background hooks have completed + """ + # Check if any pending archiveresults exist + if self.snapshot.pending_archiveresults().exists(): + return False + + # Check and finalize background hooks + background_hooks = find_background_hooks(self.snapshot) + for bg_hook in background_hooks: + if not check_background_hook_completed(bg_hook): + return False # Still running + + # Completed - finalize it + finalize_background_hook(bg_hook) + + # All done + return True +``` + +### 7.5 Deduplication + +Deduplication is handled by external filesystem tools like `fdupes` (hardlinks), ZFS dedup, Btrfs duperemove, or rdfind. Users can run these tools periodically on the archive directory to identify and link duplicate files. ArchiveBox doesn't need to track hashes or manage deduplication itself - the filesystem layer handles it transparently. + +--- + +## Testing Strategy + +### 1. Unit Tests + +```python +# tests/test_background_hooks.py + +def test_background_hook_detection(): + """Test .bg. 
suffix detection""" + assert is_background_hook(Path('on_Snapshot__21_test.bg.js')) + assert not is_background_hook(Path('on_Snapshot__21_test.js')) + +def test_find_binary_by_abspath(): + """Test binary matching by absolute path""" + machine = Machine.current() + binary = Binary.objects.create( + name='wget', + abspath='/usr/bin/wget', + machine=machine + ) + + cmd = ['/usr/bin/wget', '-p', 'url'] + assert find_binary_for_cmd(cmd, machine.id) == str(binary.id) + +def test_find_binary_by_name(): + """Test binary matching by name fallback""" + machine = Machine.current() + binary = Binary.objects.create( + name='wget', + abspath='/usr/local/bin/wget', + machine=machine + ) + + cmd = ['wget', '-p', 'url'] + assert find_binary_for_cmd(cmd, machine.id) == str(binary.id) + +def test_parse_hook_json(): + """Test JSON parsing from stdout""" + stdout = ''' + Some log output + {"type": "ArchiveResult", "status": "succeeded", "output_str": "test"} + More output + ''' + result = parse_hook_output_json(stdout) + assert result['status'] == 'succeeded' + assert result['output_str'] == 'test' +``` + +### 2. 
Integration Tests + +```python +def test_foreground_hook_execution(snapshot): + """Test foreground hook runs and returns results""" + ar = ArchiveResult.objects.create( + snapshot=snapshot, + extractor='11_favicon', + status=ArchiveResult.StatusChoices.QUEUED + ) + + ar.run() + ar.refresh_from_db() + + assert ar.status in [ + ArchiveResult.StatusChoices.SUCCEEDED, + ArchiveResult.StatusChoices.FAILED + ] + assert ar.start_ts is not None + assert ar.end_ts is not None + assert ar.output_size >= 0 + +def test_background_hook_execution(snapshot): + """Test background hook starts but doesn't block""" + ar = ArchiveResult.objects.create( + snapshot=snapshot, + extractor='21_consolelog', + status=ArchiveResult.StatusChoices.QUEUED + ) + + start = time.time() + ar.run() + duration = time.time() - start + + ar.refresh_from_db() + + # Should return quickly (< 5 seconds) + assert duration < 5 + # Should be in 'started' state + assert ar.status == ArchiveResult.StatusChoices.STARTED + # PID file should exist + assert (Path(ar.pwd) / 'hook.pid').exists() + +def test_background_hook_finalization(snapshot): + """Test background hook finalization after completion""" + # Start background hook + ar = ArchiveResult.objects.create( + snapshot=snapshot, + extractor='21_consolelog', + status=ArchiveResult.StatusChoices.STARTED, + pwd='/path/to/output' + ) + + # Simulate completion (hook writes output and exits) + # ... 
+ + # Finalize + finalize_background_hook(ar) + ar.refresh_from_db() + + assert ar.status == ArchiveResult.StatusChoices.SUCCEEDED + assert ar.end_ts is not None + assert ar.output_size > 0 +``` + +--- + +## Migration Path + +### Step 1: Create migration +```bash +cd archivebox +python manage.py makemigrations core --name archiveresult_background_hooks +``` + +### Step 2: **Plugin standardization (Phase 4)** +- Update ALL plugins to new JSONL format FIRST +- Test each plugin as you update it +- This ensures old run_hook() can still work during transition + +### Step 3: Update run_hook() (Phase 5) +- Add background hook detection +- Add log file capture +- Parse JSONL output (any line with {type: 'ModelName', ...}) +- Add plugin and plugin_hook metadata to each record + +### Step 4: Update ArchiveResult.run() (Phase 6) +- Handle None result for background hooks (return immediately) +- Parse records list from run_hook() +- Assert only one ArchiveResult record per hook +- Extend ArchiveResult record with computed fields (output_files, output_size, binary FK) +- Call `_populate_output_fields()` to walk directory and populate summary fields +- Call `create_model_record()` for any side-effect records (Binary, etc.) + +### Step 5: Add finalization helpers (Phase 7) +- `find_background_hooks()` +- `check_background_hook_completed()` +- `finalize_background_hook()` + +### Step 6: Update SnapshotMachine.is_finished() (Phase 7) +- Check for background hooks +- Finalize completed ones + +### Step 7: Rename background hooks (Phase 7) +- Rename 3 background hooks with .bg. 
suffix + +### Step 8: Test +- Unit tests +- Integration tests +- Manual testing with real snapshots + +--- + +## Success Criteria + +- ✅ Background hooks start immediately without blocking other extractors +- ✅ Background hooks are finalized after completion with full results +- ✅ All output stats calculated by runner, not hooks +- ✅ Binary FK optional and only set when determinable +- ✅ Clean separation between output_str (human) and output_json (structured) +- ✅ output_files stored as dict for easy querying and future extensibility +- ✅ Log files cleaned up on success, kept on failure +- ✅ PID files cleaned up after completion +- ✅ No plugin-specific code in core (generic polling mechanism) +- ✅ All plugins updated to clean JSONL format +- ✅ Safe incremental rollout (plugins first, then core code) + +--- + +## Future Enhancements + +### 1. Timeout for orphaned background hooks +If a background hook runs longer than MAX_LIFETIME after all foreground hooks complete, force kill it. + +### 2. Progress reporting +Background hooks could write progress to a file that gets polled: +```javascript +fs.writeFileSync('progress.txt', '50%'); +``` + +### 3. Multiple results per hook +If needed in future, extend to support multiple JSON outputs by collecting all `{type: 'ArchiveResult'}` lines. + +### 4. Dependency tracking +Store all binaries used by a hook (not just primary), useful for hooks that chain multiple tools. + +### 5. Per-file metadata in output_files +If needed, extend output_files values to include per-file metadata: +```python +output_files = { + 'index.html': { + 'size': 4096, + 'hash': 'abc123...', + 'mime_type': 'text/html', + 'modified_at': '2025-01-15T10:30:00Z' + } +} +``` +Can query with custom SQL for complex per-file queries (e.g., "find all results with any file > 50KB"). Summary fields (output_size, output_mimetypes) remain as denormalized cache for performance. 
+ +--- + +# Hook Architecture Implementation Report + +## Date: 2025-12-27 + +## Summary + +This report documents the Phase 4 plugin audit and Phase 1-7 implementation work. + +--- + +## Implementation Status + +### ✅ Phase 1: Database Migration (COMPLETE) + +Created migrations: +- `archivebox/core/migrations/0029_archiveresult_hook_fields.py` - Adds new fields +- `archivebox/core/migrations/0030_migrate_output_field.py` - Migrates old `output` field + +New ArchiveResult fields: +- [x] `output_str` (TextField) - human-readable summary +- [x] `output_json` (JSONField) - structured metadata +- [x] `output_files` (JSONField) - dict of {relative_path: {}} +- [x] `output_size` (BigIntegerField) - total bytes +- [x] `output_mimetypes` (CharField) - CSV of mimetypes sorted by size +- [x] `binary` (ForeignKey to Binary) - optional + +### ✅ Phase 3: Generic run_hook() (COMPLETE) + +Updated `archivebox/hooks.py`: +- [x] Parse JSONL output (any line with `{type: 'ModelName', ...}`) +- [x] Backwards compatible with `RESULT_JSON=` format +- [x] Add plugin metadata to each record +- [x] Detect background hooks with `.bg.` suffix +- [x] Added `find_binary_for_cmd()` helper +- [x] Added `create_model_record()` for Binary/Machine + +### ✅ Phase 6: Update ArchiveResult.run() (COMPLETE) + +Updated `archivebox/core/models.py`: +- [x] Handle background hooks (return immediately when result is None) +- [x] Process `records` from HookResult +- [x] Use new output fields +- [x] Added `_populate_output_fields()` method +- [x] Added `_set_binary_from_cmd()` method +- [x] Call `create_model_record()` for side-effect records + +### ✅ Phase 7: Background Hook Support (COMPLETE) + +Added to `archivebox/core/models.py`: +- [x] `is_background_hook()` method +- [x] `check_background_completed()` method +- [x] `finalize_background_hook()` method + +Updated `archivebox/core/statemachines.py`: +- [x] `SnapshotMachine.is_finished()` checks/finalizes background hooks + +--- + +## Phase 4: Plugin Audit + 
+### Dependency Hooks (on_Dependency__*) - ALL COMPLIANT ✅ + +| Plugin | Hook | Status | Notes | +|--------|------|--------|-------| +| apt | `on_Dependency__install_using_apt_provider.py` | ✅ OK | Emits `{type: 'Binary'}` JSONL | +| brew | `on_Dependency__install_using_brew_provider.py` | ✅ OK | Emits `{type: 'Binary'}` JSONL | +| custom | `on_Dependency__install_using_custom_bash.py` | ✅ OK | Emits `{type: 'Binary'}` JSONL | +| env | `on_Dependency__install_using_env_provider.py` | ✅ OK | Emits `{type: 'Binary'}` JSONL | +| npm | `on_Dependency__install_using_npm_provider.py` | ✅ OK | Emits `{type: 'Binary'}` JSONL | +| pip | `on_Dependency__install_using_pip_provider.py` | ✅ OK | Emits `{type: 'Binary'}` JSONL | + +### Crawl Install Hooks (on_Crawl__00_install_*) - ALL RENAMED ✅ + +| Plugin | Hook | Status | Notes | +|--------|------|--------|-------| +| chrome_session | `on_Crawl__00_install_chrome.py` | ✅ RENAMED | Emits Binary/Dependency JSONL | +| chrome_session | `on_Crawl__00_install_chrome_config.py` | ✅ RENAMED | Emits config JSONL | +| wget | `on_Crawl__00_install_wget.py` | ✅ RENAMED | Emits Binary/Dependency JSONL | +| wget | `on_Crawl__00_install_wget_config.py` | ✅ RENAMED | Emits config JSONL | +| singlefile | `on_Crawl__00_install_singlefile.py` | ✅ RENAMED | Emits Binary/Dependency JSONL | +| readability | `on_Crawl__00_install_readability.py` | ✅ RENAMED | Emits Binary/Dependency JSONL | +| media | `on_Crawl__00_install_ytdlp.py` | ✅ RENAMED | Emits Binary/Dependency JSONL | +| git | `on_Crawl__00_install_git.py` | ✅ RENAMED | Emits Binary/Dependency JSONL | +| forumdl | `on_Crawl__00_install_forumdl.py` | ✅ RENAMED | Emits Binary/Dependency JSONL | +| gallerydl | `on_Crawl__00_install_gallerydl.py` | ✅ RENAMED | Emits Binary/Dependency JSONL | +| mercury | `on_Crawl__00_install_mercury.py` | ✅ RENAMED | Emits Binary/Dependency JSONL | +| papersdl | `on_Crawl__00_install_papersdl.py` | ✅ RENAMED | Emits Binary/Dependency JSONL | +| 
search_backend_ripgrep | `on_Crawl__00_install_ripgrep.py` | ✅ RENAMED | Emits Binary/Dependency JSONL | + +### Snapshot Hooks (on_Snapshot__*) - Python Hooks UPDATED ✅ + +| Plugin | Hook | Status | Notes | +|--------|------|--------|-------| +| favicon | `on_Snapshot__11_favicon.py` | ✅ UPDATED | Now outputs clean JSONL | +| git | `on_Snapshot__12_git.py` | ✅ UPDATED | Now outputs clean JSONL with cmd | +| archivedotorg | `on_Snapshot__13_archivedotorg.py` | ✅ UPDATED | Now outputs clean JSONL | +| title | `on_Snapshot__32_title.js` | ✅ UPDATED | Now outputs clean JSONL | +| singlefile | `on_Snapshot__37_singlefile.py` | ✅ UPDATED | Now outputs clean JSONL with cmd | +| wget | `on_Snapshot__50_wget.py` | ✅ UPDATED | Now outputs clean JSONL with cmd | +| media | `on_Snapshot__51_media.py` | ✅ UPDATED | Now outputs clean JSONL with cmd | +| readability | `on_Snapshot__52_readability.py` | ✅ UPDATED | Now outputs clean JSONL with cmd | + +### Snapshot Hooks - JavaScript Hooks UPDATED ✅ + +All JS hooks have been updated to use clean JSONL format: + +| Plugin | Hook | Status | Notes | +|--------|------|--------|-------| +| chrome_session | `on_Snapshot__20_chrome_session.js` | ✅ UPDATED | Clean JSONL with cmd_version | +| consolelog | `on_Snapshot__21_consolelog.bg.js` | ✅ UPDATED | Renamed to background hook | +| ssl | `on_Snapshot__23_ssl.bg.js` | ✅ UPDATED | Renamed to background hook | +| responses | `on_Snapshot__24_responses.bg.js` | ✅ UPDATED | Renamed to background hook | +| chrome_navigate | `on_Snapshot__30_chrome_navigate.js` | ✅ UPDATED | Clean JSONL output | +| redirects | `on_Snapshot__31_redirects.js` | ✅ UPDATED | Clean JSONL output | +| title | `on_Snapshot__32_title.js` | ✅ UPDATED | Clean JSONL output | +| headers | `on_Snapshot__33_headers.js` | ✅ UPDATED | Clean JSONL output | +| screenshot | `on_Snapshot__34_screenshot.js` | ✅ UPDATED | Clean JSONL output | +| pdf | `on_Snapshot__35_pdf.js` | ✅ UPDATED | Clean JSONL output | +| dom | 
`on_Snapshot__36_dom.js` | ✅ UPDATED | Clean JSONL output | +| seo | `on_Snapshot__38_seo.js` | ✅ UPDATED | Clean JSONL output | +| accessibility | `on_Snapshot__39_accessibility.js` | ✅ UPDATED | Clean JSONL output | +| parse_dom_outlinks | `on_Snapshot__40_parse_dom_outlinks.js` | ✅ UPDATED | Clean JSONL output | + +### Background Hooks Renamed ✅ + +The following hooks have been renamed with `.bg.` suffix: + +- `on_Snapshot__21_consolelog.js` → `on_Snapshot__21_consolelog.bg.js` +- `on_Snapshot__23_ssl.js` → `on_Snapshot__23_ssl.bg.js` +- `on_Snapshot__24_responses.js` → `on_Snapshot__24_responses.bg.js` + +--- + +## Files Modified + +### Core Infrastructure +- `archivebox/hooks.py` - Updated run_hook() and added helpers +- `archivebox/core/models.py` - Updated ArchiveResult model and run() method +- `archivebox/core/statemachines.py` - Updated SnapshotMachine.is_finished() +- `archivebox/core/admin_archiveresults.py` - Updated to use output_str +- `archivebox/core/templatetags/core_tags.py` - Updated to use output_str + +### Migrations +- `archivebox/core/migrations/0029_archiveresult_hook_fields.py` (new) +- `archivebox/core/migrations/0030_migrate_output_field.py` (new) + +### Plugins Updated (Python Hooks) +- `archivebox/plugins/archivedotorg/on_Snapshot__13_archivedotorg.py` +- `archivebox/plugins/favicon/on_Snapshot__11_favicon.py` +- `archivebox/plugins/git/on_Snapshot__12_git.py` +- `archivebox/plugins/media/on_Snapshot__51_media.py` +- `archivebox/plugins/readability/on_Snapshot__52_readability.py` +- `archivebox/plugins/singlefile/on_Snapshot__37_singlefile.py` +- `archivebox/plugins/wget/on_Snapshot__50_wget.py` + +### Plugins Updated (JavaScript Hooks) +- `archivebox/plugins/chrome_session/on_Snapshot__20_chrome_session.js` +- `archivebox/plugins/consolelog/on_Snapshot__21_consolelog.bg.js` (renamed) +- `archivebox/plugins/ssl/on_Snapshot__23_ssl.bg.js` (renamed) +- `archivebox/plugins/responses/on_Snapshot__24_responses.bg.js` (renamed) +- 
`archivebox/plugins/chrome_navigate/on_Snapshot__30_chrome_navigate.js` +- `archivebox/plugins/redirects/on_Snapshot__31_redirects.js` +- `archivebox/plugins/title/on_Snapshot__32_title.js` +- `archivebox/plugins/headers/on_Snapshot__33_headers.js` +- `archivebox/plugins/screenshot/on_Snapshot__34_screenshot.js` +- `archivebox/plugins/pdf/on_Snapshot__35_pdf.js` +- `archivebox/plugins/dom/on_Snapshot__36_dom.js` +- `archivebox/plugins/seo/on_Snapshot__38_seo.js` +- `archivebox/plugins/accessibility/on_Snapshot__39_accessibility.js` +- `archivebox/plugins/parse_dom_outlinks/on_Snapshot__40_parse_dom_outlinks.js` + +--- + +## Remaining Work + +1. ~~**Update remaining JS hooks** (13 files) to output clean JSONL~~ ✅ DONE +2. ~~**Rename background hooks** with `.bg.` suffix~~ ✅ DONE +3. ~~**Write tests** for the hook architecture~~ ✅ DONE (31 tests in archivebox/tests/test_hooks.py) +4. ~~**Run migrations** and test on real data~~ ✅ DONE (migrations 0029 and 0030 applied successfully) + +## Completion Summary + +All phases of the hook architecture implementation are now complete: + +- ✅ Phase 1: Database Migration +- ✅ Phase 3: Generic run_hook() with JSONL parsing +- ✅ Phase 4: Plugin Audit (all 32 hooks updated) +- ✅ Phase 6: ArchiveResult.run() updated +- ✅ Phase 7: Background hook support + +Total hooks updated: **32 hooks** across 6 dependency providers, 13 install hooks (renamed from validate), 8 Python snapshot hooks, and 14 JS snapshot hooks (3 of which are background hooks). diff --git a/old/TODO_hook_concurrency.md b/old/TODO_hook_concurrency.md new file mode 100644 index 0000000000..c076cc7076 --- /dev/null +++ b/old/TODO_hook_concurrency.md @@ -0,0 +1,532 @@ +# ArchiveBox Hook Script Concurrency & Execution Plan + +## Overview + +Snapshot.run() should enforce that snapshot hooks are run in **10 discrete, sequential "steps"**: `0*`, `1*`, `2*`, `3*`, `4*`, `5*`, `6*`, `7*`, `8*`, `9*`. 
+ +For every discovered hook script, ArchiveBox should create an ArchiveResult in `queued` state, then manage running them using `retry_at` and inline logic to enforce this ordering. + +## Design Decisions + +### ArchiveResult Schema +- Add `ArchiveResult.hook_name` (CharField, nullable) - just filename, e.g., `'on_Snapshot__20_chrome_tab.bg.js'` +- Keep `ArchiveResult.plugin` - still important (plugin directory name) +- Step number derived on-the-fly from `hook_name` via `extract_step(hook_name)` - not stored + +### Snapshot Schema +- Add `Snapshot.current_step` (IntegerField 0-9, default=0) +- Integrate with `SnapshotMachine` state transitions for step advancement + +### Hook Discovery & Execution +- `Snapshot.run()` discovers all hooks upfront, creates one AR per hook with `hook_name` set +- All ARs for a given step can be claimed and executed in parallel by workers +- Workers claim ARs where `extract_step(ar.hook_name) <= snapshot.current_step` +- `Snapshot.advance_step_if_ready()` increments `current_step` when: + - All **foreground** hooks in current step are finished (SUCCEEDED/FAILED/SKIPPED) + - Background hooks don't block advancement (they continue running) + - Called from `SnapshotMachine` state transitions + +### ArchiveResult.run() Behavior +- If `self.hook_name` is set: run that single hook +- If `self.hook_name` is None: discover all hooks for `self.plugin` and run sequentially +- Background hooks detected by `.bg.` in filename (e.g., `on_Snapshot__20_chrome_tab.bg.js`) +- Background hooks return immediately (ArchiveResult stays in STARTED state) +- Foreground hooks wait for completion, update status from JSONL output + +### Hook Execution Flow +1. **Within a step**: Workers claim all ARs for current step in parallel +2. **Foreground hooks** (no .bg): ArchiveResult waits for completion, transitions to SUCCEEDED/FAILED/SKIPPED +3. **Background hooks** (.bg): ArchiveResult transitions to STARTED, hook continues running +4. 
**Step advancement**: `Snapshot.advance_step_if_ready()` checks: + - Are all foreground ARs in current step finished? (SUCCEEDED/FAILED/SKIPPED) + - Ignore ARs still in STARTED (background hooks) + - If yes, increment `current_step` +5. **Snapshot sealing**: When `current_step=9` and all foreground hooks are done, kill background hooks via `Snapshot.cleanup()` + +### Unnumbered Hooks +- Extract step via `re.search(r'__(\d{2})_', hook_name)`, default to 9 if no match +- Log warning for unnumbered hooks +- Purely runtime derivation - no stored field + +## Hook Numbering Convention + +Hook scripts are numbered `00` to `99` to control: +- **First digit (0-9)**: Which step they are part of +- **Second digit (0-9)**: Order within that step + +Hook scripts are launched **strictly sequentially** based on their filename alphabetical order, and run in sets of several per step before moving on to the next step. + +**Naming Format:** +``` +on_{ModelName}__{run_order}_{human_readable_description}[.bg].{ext} +``` + +**Examples:** +``` +on_Snapshot__00_this_would_run_first.sh +on_Snapshot__05_start_ytdlp_download.bg.sh +on_Snapshot__10_chrome_tab_opened.js +on_Snapshot__50_screenshot.js +on_Snapshot__53_media.bg.py +``` + +## Background (.bg) vs Foreground Scripts + +### Foreground Scripts (no .bg suffix) +- Launch in parallel with other hooks in their step +- Step waits for all foreground hooks to complete or time out +- Get killed with SIGTERM if they exceed their `PLUGINNAME_TIMEOUT` +- Step advances when all foreground hooks finish + +### Background Scripts (.bg suffix) +- Launch in parallel with other hooks in their step +- Do NOT block step progression - step can advance while they run +- Continue running across step boundaries until complete or time out +- Get killed with SIGTERM when Snapshot transitions to SEALED (via `Snapshot.cleanup()`) +- Should exit naturally when work is complete (best case) + +**Important:** A .bg script started in step 2 can keep running through steps 
3, 4, 5... until the Snapshot seals or the hook exits naturally. + +## Execution Step Guidelines + +These are **naming conventions and guidelines**, not enforced checkpoints. They provide semantic organization for plugin ordering: + +### Step 0: Pre-Setup +``` +00-09: Initial setup, validation, feature detection +``` + +### Step 1: Chrome Launch & Tab Creation +``` +10-19: Browser/tab lifecycle setup +- Chrome browser launch +- Tab creation and CDP connection +``` + +### Step 2: Navigation & Settlement +``` +20-29: Page loading and settling +- Navigate to URL +- Wait for page load +- Initial response capture (responses, ssl, consolelog as .bg listeners) +``` + +### Step 3: Page Adjustment +``` +30-39: DOM manipulation before archiving +- Hide popups/banners +- Solve captchas +- Expand comments/details sections +- Inject custom CSS/JS +- Accessibility modifications +``` + +### Step 4: Ready for Archiving +``` +40-49: Final pre-archiving checks +- Verify page is fully adjusted +- Wait for any pending modifications +``` + +### Step 5: DOM Extraction (Sequential, Non-BG) +``` +50-59: Extractors that need exclusive DOM access +- singlefile (MUST NOT be .bg) +- screenshot (MUST NOT be .bg) +- pdf (MUST NOT be .bg) +- dom (MUST NOT be .bg) +- title +- headers +- readability +- mercury + +These MUST run sequentially as they temporarily modify the DOM +during extraction, then revert it. Running in parallel would corrupt results. 
+``` + +### Step 6: Post-DOM Extraction +``` +60-69: Extractors that don't need DOM or run on downloaded files +- wget +- git +- media (.bg - can run for hours) +- gallerydl (.bg) +- forumdl (.bg) +- papersdl (.bg) +``` + +### Step 7: Chrome Cleanup +``` +70-79: Browser/tab teardown +- Close tabs +- Cleanup Chrome resources +``` + +### Step 8: Post-Processing +``` +80-89: Reprocess outputs from earlier extractors +- OCR of images +- Audio/video transcription +- URL parsing from downloaded content (rss, html, json, txt, csv, md) +- LLM analysis/summarization of outputs +``` + +### Step 9: Indexing & Finalization +``` +90-99: Save to indexes and finalize +- Index text content to Sonic/SQLite FTS +- Create symlinks +- Generate merkle trees +- Final status updates +``` + +## Hook Script Interface + +### Input: CLI Arguments (NOT stdin) +Hooks receive configuration as CLI flags (CSV or JSON-encoded): + +```bash +--url="https://example.com" +--snapshot-id="1234-5678-uuid" +--config='{"some_key": "some_value"}' +--plugins=git,media,favicon,title +--timeout=50 +--enable-something +``` + +### Input: Environment Variables +All configuration comes from env vars, defined in `plugin_dir/config.json` JSONSchema: + +```bash +WGET_BINARY=/usr/bin/wget +WGET_TIMEOUT=60 +WGET_USER_AGENT="Mozilla/5.0..." +WGET_EXTRA_ARGS="--no-check-certificate" +SAVE_WGET=True +``` + +**Required:** Every plugin must support `PLUGINNAME_TIMEOUT` for self-termination. 
+ +### Output: Filesystem (CWD) +Hooks read/write files to: +- `$CWD`: Their own output subdirectory (e.g., `archive/snapshots/{id}/wget/`) +- `$CWD/..`: Parent directory (to read outputs from other hooks) + +This allows hooks to: +- Access files created by other hooks +- Keep their outputs separate by default +- Use semaphore files for coordination (if needed) + +### Output: JSONL to stdout +Hooks emit one JSONL line per database record they want to create or update: + +```jsonl +{"type": "Tag", "name": "sci-fi"} +{"type": "ArchiveResult", "id": "1234-uuid", "status": "succeeded", "output_str": "wget/index.html"} +{"type": "Snapshot", "id": "5678-uuid", "title": "Example Page"} +``` + +See `archivebox/misc/jsonl.py` and model `from_json()` / `from_jsonl()` methods for full list of supported types and fields. + +### Output: stderr for Human Logs +Hooks should emit human-readable output or debug info to **stderr**. There are no guarantees this will be persisted long-term. Use stdout JSONL or filesystem for outputs that matter. + +### Cleanup: Delete Cruft +If hooks emit no meaningful long-term outputs, they should delete any temporary files themselves to avoid wasting space. However, the ArchiveResult DB row should be kept so we know: +- It doesn't need to be retried +- It isn't missing +- What happened (status, error message) + +### Signal Handling: SIGINT/SIGTERM +Hooks are expected to listen for polite `SIGINT`/`SIGTERM` and finish hastily, then exit cleanly. Beyond that, they may be `SIGKILL'd` at ArchiveBox's discretion. + +**If hooks double-fork or spawn long-running processes:** They must output a `.pid` file in their directory so zombies can be swept safely. + +## Hook Failure Modes & Retry Logic + +Hooks can fail in several ways. ArchiveBox handles each differently: + +### 1. 
Soft Failure (Record & Don't Retry) +**Exit:** `0` (success) +**JSONL:** `{"type": "ArchiveResult", "status": "failed", "output_str": "404 Not Found"}` + +This means: "I ran successfully, but the resource wasn't available." Don't retry this. + +**Use cases:** +- 404 errors +- Content not available +- Feature not applicable to this URL + +### 2. Hard Failure / Temporary Error (Retry Later) +**Exit:** Non-zero (1, 2, etc.) +**JSONL:** None (or incomplete) + +This means: "Something went wrong, I couldn't complete." Treat this ArchiveResult as "missing" and set `retry_at` for later. + +**Use cases:** +- 500 server errors +- Network timeouts +- Binary not found / crashed +- Transient errors + +**Behavior:** +- ArchiveBox sets `retry_at` on the ArchiveResult +- Hook will be retried during next `archivebox update` + +### 3. Partial Success (Update & Continue) +**Exit:** Non-zero +**JSONL:** Partial records emitted before crash + +**Behavior:** +- Update ArchiveResult with whatever was emitted +- Mark remaining work as "missing" with `retry_at` + +### 4. Success (Record & Continue) +**Exit:** `0` +**JSONL:** `{"type": "ArchiveResult", "status": "succeeded", "output_str": "output/file.html"}` + +This is the happy path. 
+ +### Error Handling Rules + +- **DO NOT skip hooks** based on failures +- **Continue to next hook** regardless of foreground or background failures +- **Update ArchiveResults** with whatever information is available +- **Set retry_at** for "missing" or temporarily-failed hooks +- **Let background scripts continue** even if foreground scripts fail + +## File Structure + +``` +archivebox/plugins/{plugin_name}/ +├── config.json # JSONSchema: env var config options +├── binaries.jsonl # Runtime dependencies: apt|brew|pip|npm|env +├── on_Snapshot__XX_name.py # Hook script (foreground) +├── on_Snapshot__XX_name.bg.py # Hook script (background) +└── tests/ + └── test_name.py +``` + +## Implementation Checklist + +### Phase 1: Schema Migration ✅ +- [x] Add `Snapshot.current_step` (IntegerField 0-9, default=0) +- [x] Add `ArchiveResult.hook_name` (CharField, nullable) - just filename +- [x] Create migration: `0034_snapshot_current_step.py` + +### Phase 2: Core Logic Updates ✅ +- [x] Add `extract_step(hook_name)` utility in `archivebox/hooks.py` + - Extract first digit from `__XX_` pattern + - Default to 9 for unnumbered hooks +- [x] Add `is_background_hook(hook_name)` utility in `archivebox/hooks.py` + - Check for `.bg.` in filename +- [x] Update `Snapshot.create_pending_archiveresults()` in `archivebox/core/models.py`: + - Discover all hooks (not plugins) + - Create one AR per hook with `hook_name` set +- [x] Update `ArchiveResult.run()` in `archivebox/core/models.py`: + - If `hook_name` set: run single hook + - If `hook_name` None: discover all plugin hooks (existing behavior) +- [x] Add `Snapshot.advance_step_if_ready()` method: + - Check if all foreground ARs in current step finished + - Increment `current_step` if ready + - Ignore background hooks (.bg) in completion check +- [x] Integrate with `SnapshotMachine.is_finished()` in `archivebox/core/statemachines.py`: + - Call `advance_step_if_ready()` before checking if done + +### Phase 3: Worker Coordination ✅ +- [x] 
Update worker AR claiming query in `archivebox/workers/worker.py`: + - Filter: `extract_step(ar.hook_name) <= snapshot.current_step` + - Claims ARs in QUEUED state, checks step in Python before processing + - Orders by hook_name for deterministic execution within step + +### Phase 4: Hook Renumbering ✅ +- [x] Renumber hooks per renumbering map below +- [x] Add `.bg` suffix to long-running hooks (media, gallerydl, forumdl, papersdl) +- [x] Move parse_* hooks to step 7 (70-79) +- [x] Test all hooks still work after renumbering + +## Migration Path + +### Natural Compatibility +No special migration needed: +1. Existing ARs with `hook_name=None` continue to work (discover all plugin hooks at runtime) +2. New ARs get `hook_name` set (single hook per AR) +3. `ArchiveResult.run()` handles both cases naturally +4. Unnumbered hooks default to step 9 (log warning) + +### Renumbering Map + +**Completed Renames:** +``` +# Step 5: DOM Extraction (sequential, non-background) +singlefile/on_Snapshot__37_singlefile.py → singlefile/on_Snapshot__50_singlefile.py ✅ +screenshot/on_Snapshot__34_screenshot.js → screenshot/on_Snapshot__51_screenshot.js ✅ +pdf/on_Snapshot__35_pdf.js → pdf/on_Snapshot__52_pdf.js ✅ +dom/on_Snapshot__36_dom.js → dom/on_Snapshot__53_dom.js ✅ +title/on_Snapshot__32_title.js → title/on_Snapshot__54_title.js ✅ +readability/on_Snapshot__52_readability.py → readability/on_Snapshot__55_readability.py ✅ +headers/on_Snapshot__33_headers.js → headers/on_Snapshot__55_headers.js ✅ +mercury/on_Snapshot__53_mercury.py → mercury/on_Snapshot__56_mercury.py ✅ +htmltotext/on_Snapshot__54_htmltotext.py → htmltotext/on_Snapshot__57_htmltotext.py ✅ + +# Step 6: Post-DOM Extraction (background for long-running) +wget/on_Snapshot__50_wget.py → wget/on_Snapshot__61_wget.py ✅ +git/on_Snapshot__12_git.py → git/on_Snapshot__62_git.py ✅ +media/on_Snapshot__51_media.py → media/on_Snapshot__63_media.bg.py ✅ +gallerydl/on_Snapshot__52_gallerydl.py → 
gallerydl/on_Snapshot__64_gallerydl.bg.py ✅ +forumdl/on_Snapshot__53_forumdl.py → forumdl/on_Snapshot__65_forumdl.bg.py ✅ +papersdl/on_Snapshot__54_papersdl.py → papersdl/on_Snapshot__66_papersdl.bg.py ✅ + +# Step 7: URL Extraction (parse_* hooks moved from steps 4 and 6) +parse_html_urls/on_Snapshot__60_parse_html_urls.py → parse_html_urls/on_Snapshot__70_parse_html_urls.py ✅ +parse_txt_urls/on_Snapshot__62_parse_txt_urls.py → parse_txt_urls/on_Snapshot__71_parse_txt_urls.py ✅ +parse_rss_urls/on_Snapshot__61_parse_rss_urls.py → parse_rss_urls/on_Snapshot__72_parse_rss_urls.py ✅ +parse_netscape_urls/on_Snapshot__63_parse_netscape_urls.py → parse_netscape_urls/on_Snapshot__73_parse_netscape_urls.py ✅ +parse_jsonl_urls/on_Snapshot__64_parse_jsonl_urls.py → parse_jsonl_urls/on_Snapshot__74_parse_jsonl_urls.py ✅ +parse_dom_outlinks/on_Snapshot__40_parse_dom_outlinks.js → parse_dom_outlinks/on_Snapshot__75_parse_dom_outlinks.js ✅ +``` + +## Testing Strategy + +### Unit Tests +- Test hook ordering (00-99) +- Test step grouping (first digit) +- Test .bg vs foreground execution +- Test timeout enforcement +- Test JSONL parsing +- Test failure modes & retry_at logic + +### Integration Tests +- Test full Snapshot.run() with mixed hooks +- Test .bg scripts running beyond step 9 +- Test zombie process cleanup +- Test graceful SIGTERM handling +- Test concurrent .bg script coordination + +### Performance Tests +- Measure overhead of per-hook ArchiveResults +- Test with 50+ concurrent .bg scripts +- Test filesystem contention with many hooks + +## Open Questions + +### Q: Should we provide semaphore utilities? +**A:** No. Keep plugins decoupled. Let them use simple filesystem coordination if needed. + +### Q: What happens if the ArchiveResult table gets huge? +**A:** We can delete old successful ArchiveResults periodically, or archive them to cold storage. The important data is in the filesystem outputs. + +### Q: Should naturally-exiting .bg scripts still be .bg? +**A:** Yes. 
The .bg suffix means "don't block step progression," not "run until step 9." Natural exit is the best case. + +## Examples + +### Foreground Hook (Sequential DOM Access) +```python +#!/usr/bin/env python3 +# archivebox/plugins/screenshot/on_Snapshot__51_screenshot.js + +# Runs at step 5, blocks step progression until complete +# Gets killed if it exceeds SCREENSHOT_TIMEOUT + +timeout = get_env_int('SCREENSHOT_TIMEOUT') or get_env_int('TIMEOUT', 60) + +try: + result = subprocess.run(cmd, capture_output=True, timeout=timeout) + if result.returncode == 0: + print(json.dumps({ + "type": "ArchiveResult", + "status": "succeeded", + "output_str": "screenshot.png" + })) + sys.exit(0) + else: + # Temporary failure - will be retried + sys.exit(1) +except subprocess.TimeoutExpired: + # Timeout - will be retried + sys.exit(1) +``` + +### Background Hook (Long-Running Download) +```python +#!/usr/bin/env python3 +# archivebox/plugins/ytdlp/on_Snapshot__63_ytdlp.bg.py + +# Runs at step 6, doesn't block step progression +# Gets full YTDLP_TIMEOUT (e.g., 3600s) regardless of when step 9 completes + +timeout = get_env_int('YTDLP_TIMEOUT') or get_env_int('TIMEOUT', 3600) + +try: + result = subprocess.run(['yt-dlp', url], capture_output=True, timeout=timeout) + if result.returncode == 0: + print(json.dumps({ + "type": "ArchiveResult", + "status": "succeeded", + "output_str": "media/" + })) + sys.exit(0) + else: + # Soft failure - record it, don't retry + print(json.dumps({ + "type": "ArchiveResult", + "status": "failed", + "output_str": "Video unavailable" + })) + sys.exit(0) # Exit 0 to record the failure +except subprocess.TimeoutExpired: + # Timeout - will be retried + sys.exit(1) +``` + +### Background Hook with Natural Exit +```javascript +#!/usr/bin/env node +// archivebox/plugins/ssl/on_Snapshot__23_ssl.bg.js + +// Sets up listener, captures SSL info, then exits naturally +// No SIGTERM handler needed - already exits when done + +async function main() { + const page = await 
connectToChrome(); + + // Set up listener + page.on('response', async (response) => { + const securityDetails = response.securityDetails(); + if (securityDetails) { + fs.writeFileSync('ssl.json', JSON.stringify(securityDetails)); + } + }); + + // Wait for navigation (done by other hook) + await waitForNavigation(); + + // Emit result + console.log(JSON.stringify({ + type: 'ArchiveResult', + status: 'succeeded', + output_str: 'ssl.json' + })); + + process.exit(0); // Natural exit - does not wait indefinitely +} + +main().catch(e => { + console.error(`ERROR: ${e.message}`); + process.exit(1); // Will be retried +}); +``` + +## Summary + +This plan provides: +- ✅ Clear execution ordering (10 steps, 00-99 numbering) +- ✅ Async support (.bg suffix) +- ✅ Independent timeout control per plugin +- ✅ Flexible failure handling & retry logic +- ✅ Streaming JSONL output for DB updates +- ✅ Simple filesystem-based coordination +- ✅ Backward compatibility during migration + +The main implementation work is refactoring `Snapshot.run()` to enforce step ordering and manage .bg script lifecycles. Plugin renumbering is straightforward mechanical work. diff --git a/old/TODO_hook_statemachine_cleanup.md b/old/TODO_hook_statemachine_cleanup.md new file mode 100644 index 0000000000..46297a743f --- /dev/null +++ b/old/TODO_hook_statemachine_cleanup.md @@ -0,0 +1,664 @@ +# Hook & State Machine Cleanup - Unified Pattern + +## Goal +Implement a **consistent pattern** across all models (Crawl, Snapshot, ArchiveResult, Dependency) for: +1. Running hooks +2. Processing JSONL records +3. Managing background hooks +4. 
State transitions + +## Current State Analysis (ALL COMPLETE ✅) + +### ✅ Crawl (archivebox/crawls/) +**Status**: COMPLETE +- ✅ Has state machine: `CrawlMachine` +- ✅ `Crawl.run()` - runs hooks, processes JSONL via `process_hook_records()`, creates snapshots +- ✅ `Crawl.cleanup()` - kills background hooks, runs on_CrawlEnd hooks +- ✅ Uses `OUTPUT_DIR/plugin_name/` for PWD +- ✅ State machine calls model methods: + - `queued -> started`: calls `crawl.run()` + - `started -> sealed`: calls `crawl.cleanup()` + +### ✅ Snapshot (archivebox/core/) +**Status**: COMPLETE +- ✅ Has state machine: `SnapshotMachine` +- ✅ `Snapshot.run()` - creates pending ArchiveResults +- ✅ `Snapshot.cleanup()` - kills background ArchiveResult hooks, calls `update_from_output()` +- ✅ `Snapshot.has_running_background_hooks()` - checks PID files using `process_is_alive()` +- ✅ `Snapshot.from_jsonl()` - simplified, filtering moved to caller +- ✅ State machine calls model methods: + - `queued -> started`: calls `snapshot.run()` + - `started -> sealed`: calls `snapshot.cleanup()` + - `is_finished()`: uses `has_running_background_hooks()` + +### ✅ ArchiveResult (archivebox/core/) +**Status**: COMPLETE - Major refactor completed +- ✅ Has state machine: `ArchiveResultMachine` +- ✅ `ArchiveResult.run()` - runs hook, calls `update_from_output()` for foreground hooks +- ✅ `ArchiveResult.update_from_output()` - unified method for foreground and background hooks +- ✅ Uses PWD `snapshot.OUTPUT_DIR/plugin_name` +- ✅ JSONL processing via `process_hook_records()` with URL/depth filtering +- ✅ **DELETED** special background hook methods: + - ❌ `check_background_completed()` - replaced by `process_is_alive()` helper + - ❌ `finalize_background_hook()` - replaced by `update_from_output()` + - ❌ `_populate_output_fields()` - merged into `update_from_output()` +- ✅ State machine transitions: + - `queued -> started`: calls `archiveresult.run()` + - `started -> succeeded/failed/skipped`: status set by 
`update_from_output()` + +### ✅ Binary (archivebox/machine/) - NEW! +**Status**: COMPLETE - Replaced Dependency model entirely +- ✅ Has state machine: `BinaryMachine` +- ✅ `Binary.run()` - runs on_Binary__install_* hooks, processes JSONL +- ✅ `Binary.cleanup()` - kills background installation hooks (for consistency) +- ✅ `Binary.from_jsonl()` - handles both binaries.jsonl and hook output +- ✅ Uses PWD `data/machines/{machine_id}/binaries/{name}/{id}/plugin_name/` +- ✅ Configuration via static `plugins/*/binaries.jsonl` files +- ✅ State machine calls model methods: + - `queued -> started`: calls `binary.run()` + - `started -> succeeded/failed`: status set by hooks via JSONL +- ✅ Perfect symmetry with Crawl/Snapshot/ArchiveResult pattern + +### ❌ Dependency Model - ELIMINATED +**Status**: Deleted entirely (replaced by Binary state machine) +- Static configuration now lives in `plugins/*/binaries.jsonl` +- Per-machine state tracked by Binary records +- No global singleton conflicts +- Hooks renamed from `on_Dependency__install_*` to `on_Binary__install_*` + +## Unified Pattern (Target Architecture) + +### Pattern for ALL models: + +```python +# 1. State Machine orchestrates transitions +class ModelMachine(StateMachine): + @started.enter + def enter_started(self): + self.model.run() # Do the work + # Update status + + def is_finished(self): + # Check if background hooks still running + if self.model.has_running_background_hooks(): + return False + # Check if children finished + if self.model.has_pending_children(): + return False + return True + + @sealed.enter + def enter_sealed(self): + self.model.cleanup() # Clean up background hooks + # Update status + +# 2. Model methods do the actual work +class Model: + def run(self): + """Run hooks, process JSONL, create children.""" + hooks = discover_hooks('ModelName') + for hook in hooks: + output_dir = self.OUTPUT_DIR / hook.parent.name + result = run_hook(hook, output_dir=output_dir, ...) 
+ + if result is None: # Background hook + continue + + # Process JSONL records + records = result.get('records', []) + overrides = {'model': self, 'created_by_id': self.created_by_id} + process_hook_records(records, overrides=overrides) + + # Create children (e.g., ArchiveResults, Snapshots) + self.create_children() + + def cleanup(self): + """Kill background hooks, run cleanup hooks.""" + # Kill any background hooks + if self.OUTPUT_DIR.exists(): + for pid_file in self.OUTPUT_DIR.glob('*/hook.pid'): + kill_process(pid_file) + + # Run cleanup hooks (e.g., on_ModelEnd) + cleanup_hooks = discover_hooks('ModelEnd') + for hook in cleanup_hooks: + run_hook(hook, ...) + + def has_running_background_hooks(self) -> bool: + """Check if any background hooks still running.""" + if not self.OUTPUT_DIR.exists(): + return False + for pid_file in self.OUTPUT_DIR.glob('*/hook.pid'): + if process_is_alive(pid_file): + return True + return False +``` + +### PWD Standard: +``` +model.OUTPUT_DIR/plugin_name/ +``` +- Crawl: `users/{user}/crawls/{date}/{crawl_id}/plugin_name/` +- Snapshot: `users/{user}/snapshots/{date}/{domain}/{snapshot_id}/plugin_name/` +- ArchiveResult: `users/{user}/snapshots/{date}/{domain}/{snapshot_id}/plugin_name/` (same as Snapshot) +- Binary (replaces the eliminated Dependency model, which used `dependencies/{dependency_id}/plugin_name/` with the output_dir field set directly): `data/machines/{machine_id}/binaries/{name}/{id}/plugin_name/` + +## Implementation Plan + +### Phase 1: Add unified helpers to hooks.py ✅ DONE + +**File**: `archivebox/hooks.py` + +**Status**: COMPLETE - Added three helper functions: +- `process_hook_records(records, overrides)` - lines 1258-1323 +- `process_is_alive(pid_file)` - lines 1326-1344 +- `kill_process(pid_file, sig)` - lines 1347-1362 + +```python +def process_hook_records(records: List[Dict], overrides: Dict = None) -> Dict[str, int]: + """ + Process JSONL records from hook output. + Dispatches to Model.from_jsonl() for each record type. 
+ + Args: + records: List of JSONL record dicts from result['records'] + overrides: Dict with 'snapshot', 'crawl', 'dependency', 'created_by_id', etc. + + Returns: + Dict with counts by record type + """ + stats = {} + for record in records: + record_type = record.get('type') + + # Dispatch to appropriate model + if record_type == 'Snapshot': + from archivebox.core.models import Snapshot + Snapshot.from_jsonl(record, overrides) + stats['Snapshot'] = stats.get('Snapshot', 0) + 1 + elif record_type == 'Tag': + from archivebox.core.models import Tag + Tag.from_jsonl(record, overrides) + stats['Tag'] = stats.get('Tag', 0) + 1 + elif record_type == 'Binary': + from archivebox.machine.models import Binary + Binary.from_jsonl(record, overrides) + stats['Binary'] = stats.get('Binary', 0) + 1 + # ... etc + return stats + +def process_is_alive(pid_file: Path) -> bool: + """Check if process in PID file is still running.""" + if not pid_file.exists(): + return False + try: + pid = int(pid_file.read_text().strip()) + os.kill(pid, 0) # Signal 0 = check if exists + return True + except (OSError, ValueError): + return False + +def kill_process(pid_file: Path, signal=SIGTERM): + """Kill process in PID file.""" + if not pid_file.exists(): + return + try: + pid = int(pid_file.read_text().strip()) + os.kill(pid, signal) + except (OSError, ValueError): + pass +``` + +### Phase 2: Add Model.from_jsonl() static methods ✅ DONE + +**Files**: `archivebox/core/models.py`, `archivebox/machine/models.py`, `archivebox/crawls/models.py` + +**Status**: COMPLETE - Added from_jsonl() to: +- ✅ `Tag.from_jsonl()` - core/models.py lines 93-116 +- ✅ `Snapshot.from_jsonl()` - core/models.py lines 1144-1189 +- ✅ `Machine.from_jsonl()` - machine/models.py lines 66-89 +- ✅ `Dependency.from_jsonl()` - machine/models.py lines 203-227 +- ✅ `Binary.from_jsonl()` - machine/models.py lines 401-434 + +Example implementations added: + +```python +class Snapshot: + @staticmethod + def from_jsonl(record: Dict, 
overrides: Dict = None): + """Create/update Snapshot from JSONL record.""" + from archivebox.misc.jsonl import get_or_create_snapshot + overrides = overrides or {} + + # Apply overrides (crawl, parent_snapshot, depth limits) + crawl = overrides.get('crawl') + snapshot = overrides.get('snapshot') # parent + + if crawl: + depth = record.get('depth', (snapshot.depth + 1 if snapshot else 1)) + if depth > crawl.max_depth: + return None + record.setdefault('crawl_id', str(crawl.id)) + record.setdefault('depth', depth) + if snapshot: + record.setdefault('parent_snapshot_id', str(snapshot.id)) + + created_by_id = overrides.get('created_by_id') + new_snapshot = get_or_create_snapshot(record, created_by_id=created_by_id) + new_snapshot.status = Snapshot.StatusChoices.QUEUED + new_snapshot.retry_at = timezone.now() + new_snapshot.save() + return new_snapshot + +class Tag: + @staticmethod + def from_jsonl(record: Dict, overrides: Dict = None): + """Create/update Tag from JSONL record.""" + from archivebox.misc.jsonl import get_or_create_tag + tag = get_or_create_tag(record) + # Auto-attach to snapshot if in overrides + if overrides and 'snapshot' in overrides: + overrides['snapshot'].tags.add(tag) + return tag + +class Binary: + @staticmethod + def from_jsonl(record: Dict, overrides: Dict = None): + """Create/update Binary from JSONL record.""" + # Implementation similar to existing create_model_record() + ... + +# Etc for other models +``` + +### Phase 3: Update ArchiveResult to use unified pattern ✅ DONE + +**File**: `archivebox/core/models.py` + +**Status**: COMPLETE + +**Changes made**: + +1. ✅ **Replaced inline JSONL processing** (lines 1912-1950): + - Pre-filter Snapshot records for depth/URL constraints in ArchiveResult.run() + - Use `self._url_passes_filters(url)` with parent snapshot's config for proper hierarchy + - Replaced inline Tag/Snapshot/other record creation with `process_hook_records()` + - Removed ~60 lines of duplicate code + +2. 
✅ **Simplified Snapshot.from_jsonl()** (lines 1144-1189): + - Removed depth checking (now done in caller) + - Just applies crawl metadata and creates snapshot + - Added docstring note: "Filtering should be done by caller BEFORE calling this method" + +3. ✅ **Preserved ArchiveResult self-update logic**: + - Status/output fields still updated from ArchiveResult JSONL record (lines 1856-1910) + - Special title extractor logic preserved (line 1952+) + - Search indexing trigger preserved (line 1957+) + +4. ✅ **Key insight**: Filtering happens in ArchiveResult.run() where we have parent snapshot context, NOT in from_jsonl() where we'd lose config hierarchy + +**Note**: Did NOT delete special background hook methods (`check_background_completed`, `finalize_background_hook`) - that's Phase 6 + +### Phase 4: Add Snapshot.cleanup() method ✅ DONE + +**File**: `archivebox/core/models.py` + +**Status**: COMPLETE + +**Changes made**: + +1. ✅ **Added Snapshot.cleanup()** (lines 1144-1175): + - Kills background ArchiveResult hooks by scanning for `*/hook.pid` files + - Finalizes background ArchiveResults using `finalize_background_hook()` (temporary until Phase 6) + - Called by state machine when entering sealed state + +2. ✅ **Added Snapshot.has_running_background_hooks()** (lines 1177-1195): + - Checks if any background hooks still running using `process_is_alive()` + - Used by state machine in `is_finished()` check + +### Phase 5: Update SnapshotMachine to use cleanup() ✅ DONE + +**File**: `archivebox/core/statemachines.py` + +**Status**: COMPLETE + +**Changes made**: + +1. ✅ **Simplified is_finished()** (lines 58-72): + - Removed inline background hook checking and finalization (lines 67-76 deleted) + - Now uses `self.snapshot.has_running_background_hooks()` (line 68) + - Removed ~12 lines of duplicate logic + +2. 
✅ **Added cleanup() to sealed.enter** (lines 102-111): + - Calls `self.snapshot.cleanup()` to kill background hooks (line 105) + - Follows unified pattern: cleanup happens on seal, not in is_finished() + +### Phase 6: Add ArchiveResult.update_from_output() and simplify run() ✅ DONE + +**File**: `archivebox/core/models.py` + +**Status**: COMPLETE - The BIG refactor (removed ~200 lines of duplication) + +**Changes made**: + +1. ✅ **Added `ArchiveResult.update_from_output()`** (lines 1908-2061): + - Unified method for both foreground and background hooks + - Reads stdout.log and parses JSONL records + - Updates status/output_str/output_json from ArchiveResult JSONL record + - Walks filesystem to populate output_files/output_size/output_mimetypes + - Filters Snapshot records for depth/URL constraints (same as run()) + - Processes side-effect records via `process_hook_records()` + - Updates snapshot title if title extractor + - Triggers search indexing if succeeded + - Cleans up PID files and empty logs + - ~160 lines of comprehensive logic + +2. ✅ **Simplified `ArchiveResult.run()`** (lines 1841-1906): + - Removed ~120 lines of duplicate filesystem reading logic + - Now just sets start_ts/pwd and calls `update_from_output()` + - Background hooks: return immediately after saving status=STARTED + - Foreground hooks: call `update_from_output()` to do all the work + - Removed ~10 lines of duplicate code + +3. ✅ **Updated `Snapshot.cleanup()`** (line 1172): + - Changed from `ar.finalize_background_hook()` to `ar.update_from_output()` + - Uses the unified method instead of the old special-case method + +4. ✅ **Deleted `_populate_output_fields()`** (was ~45 lines): + - Logic merged into `update_from_output()` + - Eliminates duplication of filesystem walking code + +5. ✅ **Deleted `check_background_completed()`** (was ~20 lines): + - Replaced by `process_is_alive(pid_file)` from hooks.py + - Generic helper used by Snapshot.has_running_background_hooks() + +6. 
✅ **Deleted `finalize_background_hook()`** (was ~85 lines): + - Completely replaced by `update_from_output()` + - Was duplicate of foreground hook finalization logic + +**Total lines removed**: ~280 lines of duplicate code +**Total lines added**: ~160 lines of unified code +**Net reduction**: ~120 lines (-43%) + +### Phase 7-8: Dependency State Machine ❌ NOT NEEDED + +**Status**: Intentionally skipped - Dependency doesn't need a state machine (superseded: the Dependency model was later eliminated entirely and replaced by the Binary state machine; see "FINAL ARCHITECTURE" below) + +**Why no state machine for Dependency?** + +1. **Wrong Granularity**: Dependency is a GLOBAL singleton (one record per binary name) + - Multiple machines would race to update the same `status`/`retry_at` fields + - No clear semantics: "started" on which machine? "failed" on Machine A but "succeeded" on Machine B? + +2. **Wrong Timing**: Installation should be SYNCHRONOUS, not queued + - When a worker needs wget, it should install wget NOW, not queue it for later + - No benefit to async state machine transitions + +3. **State Lives Elsewhere**: Binary records are the actual state + - Each machine has its own Binary records (one per machine per binary) + - Binary.machine FK provides proper per-machine state tracking + +**Correct Architecture:** +``` +Dependency (global, no state machine): + ├─ Configuration: bin_name, bin_providers, overrides + ├─ run() method: synchronous installation attempt + └─ NO status, NO retry_at, NO state_machine_name + +Binary (per-machine, has machine FK): + ├─ State: is this binary installed on this specific machine? 
+ ├─ Created via JSONL output from on_Dependency hooks + └─ unique_together = (machine, name, abspath, version, sha256) +``` + +**What was implemented:** +- ✅ **Refactored `Dependency.run()`** (lines 249-324): + - Uses `discover_hooks()` and `process_hook_records()` for consistency + - Added comprehensive docstring explaining why no state machine + - Synchronous execution: returns Binary or None immediately + - Uses unified JSONL processing pattern +- ✅ **Kept Dependency simple**: Just configuration fields, no state fields +- ✅ **Multi-machine support**: Each machine independently runs Dependency.run() and creates its own Binary + +## Summary of Changes + +### Progress: 6/6 Core Phases Complete ✅ + 2 Phases Skipped (Intentionally) + +**ALL core functionality is now complete!** The unified pattern is consistently implemented across Crawl, Snapshot, and ArchiveResult. Dependency intentionally kept simple (no state machine needed). + +### Files Modified: + +1. ✅ **DONE** `archivebox/hooks.py` - Add unified helpers: + - ✅ `process_hook_records(records, overrides)` - dispatcher (lines 1258-1323) + - ✅ `process_is_alive(pid_file)` - check if PID still running (lines 1326-1344) + - ✅ `kill_process(pid_file)` - kill process (lines 1347-1362) + +2. ✅ **DONE** `archivebox/crawls/models.py` - Already updated: + - ✅ `Crawl.run()` - runs hooks, processes JSONL, creates snapshots + - ✅ `Crawl.cleanup()` - kills background hooks, runs on_CrawlEnd + +3. 
✅ **DONE** `archivebox/core/models.py`: + - ✅ `Tag.from_jsonl()` - lines 93-116 + - ✅ `Snapshot.from_jsonl()` - lines 1197-1234 (simplified, removed filtering) + - ✅ `Snapshot.cleanup()` - lines 1144-1172 (kill background hooks, calls ar.update_from_output()) + - ✅ `Snapshot.has_running_background_hooks()` - lines 1174-1193 (check PIDs) + - ✅ `ArchiveResult.run()` - simplified, uses `update_from_output()` (lines 1841-1906) + - ✅ `ArchiveResult.update_from_output()` - unified filesystem reading (lines 1908-2061) + - ✅ **DELETED** `ArchiveResult.check_background_completed()` - replaced by `process_is_alive()` + - ✅ **DELETED** `ArchiveResult.finalize_background_hook()` - replaced by `update_from_output()` + - ✅ **DELETED** `ArchiveResult._populate_output_fields()` - merged into `update_from_output()` + +4. ✅ **DONE** `archivebox/core/statemachines.py`: + - ✅ Simplified `SnapshotMachine.is_finished()` - uses `has_running_background_hooks()` (line 68) + - ✅ Added cleanup call to `SnapshotMachine.sealed.enter` (line 105) + +5. ✅ **DONE** `archivebox/machine/models.py`: + - ✅ `Machine.from_jsonl()` - lines 66-89 + - ✅ `Dependency.from_jsonl()` - lines 203-227 + - ✅ `Binary.from_jsonl()` - lines 401-434 + - ✅ Refactored `Dependency.run()` to use unified pattern (lines 249-324) + - ✅ Added comprehensive docstring explaining why Dependency doesn't need state machine + - ✅ Kept Dependency simple: no state fields, synchronous execution only + +### Code Metrics: +- **Lines removed**: ~280 lines of duplicate code +- **Lines added**: ~160 lines of unified code +- **Net reduction**: ~120 lines total (-43%) +- **Files created**: 0 (no new files needed) + +### Key Benefits: + +1. **Consistency**: All stateful models (Crawl, Snapshot, ArchiveResult) follow the same unified state machine pattern +2. **Simplicity**: Eliminated special-case background hook handling (~280 lines of duplicate code) +3. **Correctness**: Background hooks are properly cleaned up on seal transition +4. 
**Maintainability**: Unified `process_hook_records()` dispatcher for all JSONL processing +5. **Testability**: Consistent pattern makes testing easier +6. **Clear Separation**: Stateful work items (Crawl/Snapshot/ArchiveResult) vs stateless config (Dependency) +7. **Multi-Machine Support**: Dependency remains simple synchronous config, Binary tracks per-machine state + +## Final Unified Pattern + +All models now follow this consistent architecture: + +### State Machine Structure +```python +class ModelMachine(StateMachine): + queued = State(initial=True) + started = State() + sealed/succeeded/failed = State(final=True) + + @started.enter + def enter_started(self): + self.model.run() # Execute the work + + @sealed.enter # or @succeeded.enter + def enter_sealed(self): + self.model.cleanup() # Clean up background hooks +``` + +### Model Methods +```python +class Model: + # State machine fields + status = CharField(default='queued') + retry_at = DateTimeField(default=timezone.now) + output_dir = CharField(default='', blank=True) + state_machine_name = 'app.statemachines.ModelMachine' + + def run(self): + """Run hooks, process JSONL, create children.""" + hooks = discover_hooks('EventName') + for hook in hooks: + output_dir = self.OUTPUT_DIR / hook.parent.name + result = run_hook(hook, output_dir=output_dir, ...) 
+ + if result is None: # Background hook + continue + + # Process JSONL records + overrides = {'model': self, 'created_by_id': self.created_by_id} + process_hook_records(result['records'], overrides=overrides) + + def cleanup(self): + """Kill background hooks, run cleanup hooks.""" + for pid_file in self.OUTPUT_DIR.glob('*/hook.pid'): + kill_process(pid_file) + # Update children from filesystem + child.update_from_output() + + def update_and_requeue(self, **fields): + """Update fields and bump modified_at.""" + for field, value in fields.items(): + setattr(self, field, value) + self.save(update_fields=[*fields.keys(), 'modified_at']) + + @staticmethod + def from_jsonl(record: dict, overrides: dict = None): + """Create/update model from JSONL record.""" + # Implementation specific to model + # Called by process_hook_records() +``` + +### Hook Processing Flow +``` +1. Model.run() discovers hooks +2. Hooks execute and output JSONL to stdout +3. JSONL records dispatched via process_hook_records() +4. Each record type handled by Model.from_jsonl() +5. Background hooks tracked via hook.pid files +6. Model.cleanup() kills background hooks on seal +7. 
Children updated via update_from_output() +``` + +### Multi-Machine Coordination +- **Work Items** (Crawl, Snapshot, ArchiveResult): No machine FK, any worker can claim +- **Resources** (Binary): Machine FK, one per machine per binary +- **Configuration** (Dependency): No machine FK, global singleton, synchronous execution +- **Execution Tracking** (ArchiveResult.iface): FK to NetworkInterface for observability + +## Testing Checklist + +- [ ] Test Crawl → Snapshot creation with hooks +- [ ] Test Snapshot → ArchiveResult creation +- [ ] Test ArchiveResult foreground hooks (JSONL processing) +- [ ] Test ArchiveResult background hooks (PID tracking, cleanup) +- [ ] Test Dependency.run() synchronous installation +- [ ] Test background hook cleanup on seal transition +- [ ] Test multi-machine Crawl execution +- [ ] Test Binary creation per machine (one per machine per binary) +- [ ] Verify Dependency.run() can be called concurrently from multiple machines safely + +## FINAL ARCHITECTURE (Phases 1-8 Complete) + +### ✅ Phases 1-6: Core Models Unified +All core models (Crawl, Snapshot, ArchiveResult) now follow the unified pattern: +- State machines orchestrate transitions +- `.run()` methods execute hooks and process JSONL +- `.cleanup()` methods kill background hooks +- `.update_and_requeue()` methods update state for worker coordination +- Consistent use of `process_hook_records()` for JSONL dispatching + +### ✅ Phases 7-8: Binary State Machine (Dependency Model Eliminated) + +**Key Decision**: Eliminated `Dependency` model entirely and made `Binary` the state machine. 
+ +#### New Architecture +- **Static Configuration**: `plugins/{plugin}/dependencies.jsonl` files define binary requirements + ```jsonl + {"type": "Binary", "name": "yt-dlp", "bin_providers": "pip,brew,apt,env"} + {"type": "Binary", "name": "node", "bin_providers": "apt,brew,env", "overrides": {"apt": {"packages": ["nodejs"]}}} + {"type": "Binary", "name": "ffmpeg", "bin_providers": "apt,brew,env"} + ``` + +- **Dynamic State**: `Binary` model tracks per-machine installation state + - Fields: `machine`, `name`, `bin_providers`, `overrides`, `abspath`, `version`, `sha256`, `binprovider` + - State machine: `queued → started → succeeded/failed` + - Output dir: `data/machines/{machine_id}/binaries/{binary_name}/{binary_id}/` + +#### Binary State Machine Flow +```python +class BinaryMachine(StateMachine): + queued → started → succeeded/failed + + @started.enter + def enter_started(self): + self.binary.run() # Runs on_Binary__install_* hooks + +class Binary(models.Model): + def run(self): + """ + Runs ALL on_Binary__install_* hooks. + Each hook checks bin_providers and decides if it can handle this binary. + First hook to succeed wins. + Outputs JSONL with abspath, version, sha256, binprovider. 
+ """ + hooks = discover_hooks('Binary') + for hook in hooks: + result = run_hook(hook, output_dir=self.OUTPUT_DIR/plugin_name, + binary_id=self.id, machine_id=self.machine_id, + name=self.name, bin_providers=self.bin_providers, + overrides=json.dumps(self.overrides)) + + # Hook outputs: {"type": "Binary", "name": "wget", "abspath": "/usr/bin/wget", "version": "1.21", "binprovider": "apt"} + # Binary.from_jsonl() updates self with installation results +``` + +#### Hook Naming Convention +- **Before**: `on_Dependency__install_using_pip_provider.py` +- **After**: `on_Binary__install_using_pip_provider.py` + +Each hook checks `--bin-providers` CLI argument: +```python +if 'pip' not in bin_providers.split(','): + sys.exit(0) # Skip this binary +``` + +#### Perfect Symmetry Achieved +All models now follow identical patterns: +```python +Crawl(queued) → CrawlMachine → Crawl.run() → sealed +Snapshot(queued) → SnapshotMachine → Snapshot.run() → sealed +ArchiveResult(queued) → ArchiveResultMachine → ArchiveResult.run() → succeeded/failed +Binary(queued) → BinaryMachine → Binary.run() → succeeded/failed +``` + +#### Benefits of Eliminating Dependency +1. **No global singleton conflicts**: Binary is per-machine, no race conditions +2. **Simpler data model**: One table instead of two (Dependency + Binary) +3. **Static configuration**: dependencies.jsonl in version control, not database +4. **Consistent state machine**: Binary follows same pattern as other models +5. 
**Cleaner hooks**: Hooks check bin_providers themselves instead of orchestrator parsing names + +#### Multi-Machine Coordination +- **Work Items** (Crawl, Snapshot, ArchiveResult): No machine FK, any worker can claim +- **Resources** (Binary): Machine FK, one per machine per binary name +- **Configuration**: Static files in `plugins/*/dependencies.jsonl` +- **Execution Tracking**: ArchiveResult.iface FK to NetworkInterface for observability + +### Testing Checklist (Updated) +- [x] Core models use unified hook pattern (Phases 1-6) +- [ ] Binary installation via state machine +- [ ] Multiple machines can install same binary independently +- [ ] Hook bin_providers filtering works correctly +- [ ] Binary.from_jsonl() handles both dependencies.jsonl and hook output +- [ ] Binary OUTPUT_DIR structure: data/machines/{machine_id}/binaries/{name}/{id}/ diff --git a/old/TODO_process_tracking.md b/old/TODO_process_tracking.md new file mode 100644 index 0000000000..570c3c6eb6 --- /dev/null +++ b/old/TODO_process_tracking.md @@ -0,0 +1,1947 @@ +# Process Hierarchy Tracking Implementation Plan + +## Overview + +This document outlines the plan to refactor ArchiveBox's process management to use the `machine.Process` model as the central data structure for tracking all subprocess spawning and lifecycle management. 
+ +### Goal + +Create a complete hierarchy of `Process` records that track every subprocess from CLI invocation down to individual binary executions: + +``` +Process(cmd=['archivebox', 'add', 'https://example.com']) # CLI entry + └── Process(cmd=['supervisord', ...], parent=^) # Daemon manager + └── Process(cmd=['orchestrator'], parent=^) # Work distributor + └── Process(cmd=['crawl_worker'], parent=^) # Crawl processor + └── Process(cmd=['snapshot_worker'], parent=^) + └── Process(cmd=['archiveresult_worker'], parent=^) + └── Process(cmd=['hook.py', ...], parent=^) # Hook script + └── Process(cmd=['wget', ...], parent=^) # Binary +``` + +--- + +## Phase 1: Model Changes + +### 1.1 Add `parent` FK to Process Model + +**File:** `archivebox/machine/models.py` + +```python +class Process(ModelWithHealthStats): + # ... existing fields ... + + # NEW: Parent process FK for hierarchy tracking + parent = models.ForeignKey( + 'self', + on_delete=models.SET_NULL, + null=True, + blank=True, + related_name='children', + help_text='Parent process that spawned this one' + ) +``` + +**Migration needed:** Yes, new nullable FK field. 
+ +### 1.2 Add Process Type Field + +To distinguish between different process types in the hierarchy: + +```python +class Process(ModelWithHealthStats): + class TypeChoices(models.TextChoices): + CLI = 'cli', 'CLI Command' + SUPERVISORD = 'supervisord', 'Supervisord Daemon' + ORCHESTRATOR = 'orchestrator', 'Orchestrator' + WORKER = 'worker', 'Worker Process' + HOOK = 'hook', 'Hook Script' + BINARY = 'binary', 'Binary Execution' + + process_type = models.CharField( + max_length=16, + choices=TypeChoices.choices, + default=TypeChoices.BINARY, + db_index=True, + help_text='Type of process in the execution hierarchy' + ) +``` + +### 1.3 Add `Process.current()` Class Method (like `Machine.current()`) + +Following the pattern established by `Machine.current()`, add a method to get-or-create the Process record for the current OS process: + +```python +import os +import sys +import psutil +from datetime import timedelta +from django.utils import timezone + +_CURRENT_PROCESS = None +PROCESS_RECHECK_INTERVAL = 60 # Re-validate every 60 seconds +PID_REUSE_WINDOW = timedelta(hours=24) # Max age for considering a PID match valid +START_TIME_TOLERANCE = 5.0 # Seconds tolerance for start time matching + + +class ProcessManager(models.Manager): + def current(self) -> 'Process': + return Process.current() + + def get_by_pid(self, pid: int, machine: 'Machine' = None) -> 'Process | None': + """ + Find a Process by PID with proper validation against PID reuse. + + IMPORTANT: PIDs are reused by the OS! This method: + 1. Filters by machine (required - PIDs are only unique per machine) + 2. Filters by time window (processes older than 24h are stale) + 3. 
Validates via psutil that start times match + + Args: + pid: OS process ID + machine: Machine instance (defaults to current machine) + + Returns: + Process if found and validated, None otherwise + """ + machine = machine or Machine.current() + + # Get the actual process start time from OS + try: + os_proc = psutil.Process(pid) + os_start_time = os_proc.create_time() + except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess): + # Process doesn't exist - any DB record with this PID is stale + return None + + # Query candidates: same machine, same PID, recent, still RUNNING + candidates = self.filter( + machine=machine, + pid=pid, + status=Process.StatusChoices.RUNNING, + started_at__gte=timezone.now() - PID_REUSE_WINDOW, # Only recent processes + ).order_by('-started_at') # Most recent first + + for candidate in candidates: + # Validate start time matches (within tolerance) + if candidate.started_at: + db_start_time = candidate.started_at.timestamp() + if abs(db_start_time - os_start_time) < START_TIME_TOLERANCE: + return candidate + + return None + + +class Process(ModelWithHealthStats): + # ... existing fields ... + + objects: ProcessManager = ProcessManager() + + @classmethod + def current(cls) -> 'Process': + """ + Get or create the Process record for the current OS process. + + Similar to Machine.current(), this: + 1. Checks cache for existing Process with matching PID + 2. Validates the cached Process is still valid (PID not reused) + 3. Creates new Process if needed + + IMPORTANT: Uses psutil to validate PID hasn't been reused. + PIDs are recycled by OS, so we compare start times. 
+ """ + global _CURRENT_PROCESS + + current_pid = os.getpid() + machine = Machine.current() + + # Check cache validity + if _CURRENT_PROCESS: + # Verify: same PID, same machine, cache not expired + if (_CURRENT_PROCESS.pid == current_pid and + _CURRENT_PROCESS.machine_id == machine.id and + timezone.now() < _CURRENT_PROCESS.modified_at + timedelta(seconds=PROCESS_RECHECK_INTERVAL)): + return _CURRENT_PROCESS + _CURRENT_PROCESS = None + + # Get actual process start time from OS for validation + try: + os_proc = psutil.Process(current_pid) + os_start_time = os_proc.create_time() + except (psutil.NoSuchProcess, psutil.AccessDenied): + os_start_time = None + + # Try to find existing Process for this PID on this machine + # Filter by: machine + PID + RUNNING + recent + start time matches + if os_start_time: + existing = cls.objects.filter( + machine=machine, + pid=current_pid, + status=cls.StatusChoices.RUNNING, + started_at__gte=timezone.now() - PID_REUSE_WINDOW, + ).order_by('-started_at').first() + + if existing and existing.started_at: + db_start_time = existing.started_at.timestamp() + if abs(db_start_time - os_start_time) < START_TIME_TOLERANCE: + _CURRENT_PROCESS = existing + return existing + + # No valid existing record - create new one + parent = cls._find_parent_process(machine) + process_type = cls._detect_process_type() + + # Use psutil start time if available (more accurate than timezone.now()) + if os_start_time: + from datetime import datetime + started_at = datetime.fromtimestamp(os_start_time, tz=timezone.get_current_timezone()) + else: + started_at = timezone.now() + + _CURRENT_PROCESS = cls.objects.create( + machine=machine, + parent=parent, + process_type=process_type, + cmd=sys.argv, + pwd=os.getcwd(), + pid=current_pid, + started_at=started_at, + status=cls.StatusChoices.RUNNING, + ) + return _CURRENT_PROCESS + + @classmethod + def _find_parent_process(cls, machine: 'Machine' = None) -> 'Process | None': + """ + Find the parent Process record by 
looking up PPID. + + IMPORTANT: Validates against PID reuse by checking: + 1. Same machine (PIDs are only unique per machine) + 2. Start time matches OS process start time + 3. Process is still RUNNING and recent + + Returns None if parent is not an ArchiveBox process. + """ + ppid = os.getppid() + machine = machine or Machine.current() + + # Get parent process start time from OS + try: + os_parent = psutil.Process(ppid) + os_parent_start = os_parent.create_time() + except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess): + return None # Parent process doesn't exist + + # Find matching Process record + candidates = cls.objects.filter( + machine=machine, + pid=ppid, + status=cls.StatusChoices.RUNNING, + started_at__gte=timezone.now() - PID_REUSE_WINDOW, + ).order_by('-started_at') + + for candidate in candidates: + if candidate.started_at: + db_start_time = candidate.started_at.timestamp() + if abs(db_start_time - os_parent_start) < START_TIME_TOLERANCE: + return candidate + + return None # No matching ArchiveBox parent process + + @classmethod + def _detect_process_type(cls) -> str: + """ + Detect the type of the current process from sys.argv. + """ + argv_str = ' '.join(sys.argv).lower() + + if 'supervisord' in argv_str: + return cls.TypeChoices.SUPERVISORD + elif 'orchestrator' in argv_str: + return cls.TypeChoices.ORCHESTRATOR + elif any(w in argv_str for w in ['crawl_worker', 'snapshot_worker', 'archiveresult_worker']): + return cls.TypeChoices.WORKER + elif 'archivebox' in argv_str: + return cls.TypeChoices.CLI + else: + return cls.TypeChoices.BINARY + + @classmethod + def cleanup_stale_running(cls, machine: 'Machine' = None) -> int: + """ + Mark stale RUNNING processes as EXITED. + + Processes are stale if: + - Status is RUNNING but OS process no longer exists + - Status is RUNNING but started_at is older than PID_REUSE_WINDOW + + Returns count of processes cleaned up. 
+ """ + machine = machine or Machine.current() + cleaned = 0 + + stale = cls.objects.filter( + machine=machine, + status=cls.StatusChoices.RUNNING, + ) + + for proc in stale: + is_stale = False + + # Check if too old (PID definitely reused) + if proc.started_at and proc.started_at < timezone.now() - PID_REUSE_WINDOW: + is_stale = True + else: + # Check if OS process still exists with matching start time + try: + os_proc = psutil.Process(proc.pid) + if proc.started_at: + db_start = proc.started_at.timestamp() + os_start = os_proc.create_time() + if abs(db_start - os_start) > START_TIME_TOLERANCE: + is_stale = True # PID reused by different process + except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess): + is_stale = True # Process no longer exists + + if is_stale: + proc.status = cls.StatusChoices.EXITED + proc.ended_at = proc.ended_at or timezone.now() + proc.exit_code = proc.exit_code if proc.exit_code is not None else -1 + proc.save(update_fields=['status', 'ended_at', 'exit_code']) + cleaned += 1 + + return cleaned +``` + +**Key Benefits:** +- **Automatic hierarchy**: Calling `Process.current()` from anywhere auto-links to parent +- **Cached**: Like `Machine.current()`, avoids repeated DB queries +- **PID reuse protection**: Validates via psutil start time comparison (PIDs recycle!) +- **Machine-scoped**: All queries filter by `machine=Machine.current()` +- **Time-windowed**: Ignores processes older than 24h (stale PID matches) +- **Self-healing**: `cleanup_stale_running()` marks orphaned processes as EXITED + +**Usage pattern:** +```python +# In any ArchiveBox code that spawns a subprocess: +parent = Process.current() # Get/create record for THIS process +child = Process.objects.create( + parent=parent, + cmd=['wget', ...], + ... +) +child.launch() +``` + +### 1.4 Add Helper Methods for Tree Traversal + +```python +class Process(ModelWithHealthStats): + # ... existing fields ... 
+ + @property + def root(self) -> 'Process': + """Get the root process (CLI command) of this hierarchy.""" + proc = self + while proc.parent_id: + proc = proc.parent + return proc + + @property + def ancestors(self) -> list['Process']: + """Get all ancestor processes from parent to root.""" + ancestors = [] + proc = self.parent + while proc: + ancestors.append(proc) + proc = proc.parent + return ancestors + + @property + def depth(self) -> int: + """Get depth in the process tree (0 = root).""" + return len(self.ancestors) + + def get_descendants(self, include_self: bool = False) -> QuerySet['Process']: + """Get all descendant processes recursively.""" + # Note: For deep hierarchies, consider using django-mptt or django-treebeard + # For now, simple recursive query (limited depth in practice) + from django.db.models import Q + + if include_self: + pks = [self.pk] + else: + pks = [] + + children = list(self.children.values_list('pk', flat=True)) + while children: + pks.extend(children) + children = list(Process.objects.filter(parent_id__in=children).values_list('pk', flat=True)) + + return Process.objects.filter(pk__in=pks) +``` + +### 1.5 Add `Process.proc` Property for Validated psutil Access + +The `proc` property provides a validated `psutil.Process` object, ensuring the PID matches our recorded process (not a recycled PID): + +```python +class Process(ModelWithHealthStats): + # ... existing fields ... + + @property + def proc(self) -> 'psutil.Process | None': + """ + Get validated psutil.Process for this record. + + Returns psutil.Process ONLY if: + 1. Process with this PID exists in OS + 2. OS process start time matches our started_at (within tolerance) + 3. Process is on current machine + + Returns None if: + - PID doesn't exist (process exited) + - PID was reused by a different process (start times don't match) + - We're on a different machine than where process ran + + This prevents accidentally matching a stale/recycled PID. 
+ """ + import psutil + from archivebox.machine.models import Machine + + # Can't get psutil.Process if we don't have a PID + if not self.pid: + return None + + # Can't validate processes on other machines + if self.machine_id != Machine.current().id: + return None + + try: + os_proc = psutil.Process(self.pid) + except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess): + return None # Process no longer exists + + # Validate start time matches to prevent PID reuse confusion + if self.started_at: + os_start_time = os_proc.create_time() + db_start_time = self.started_at.timestamp() + + if abs(os_start_time - db_start_time) > START_TIME_TOLERANCE: + # PID has been reused by a different process! + return None + + # Optionally validate command matches (extra safety) + # This catches edge cases where start times are within tolerance + # but it's actually a different process + if self.cmd: + try: + os_cmdline = os_proc.cmdline() + # Check if first arg (binary) matches + if os_cmdline and self.cmd: + os_binary = os_cmdline[0] if os_cmdline else '' + db_binary = self.cmd[0] if self.cmd else '' + # Match by basename (handles /usr/bin/python3 vs python3) + if os_binary and db_binary: + from pathlib import Path + if Path(os_binary).name != Path(db_binary).name: + return None # Different binary, PID reused + except (psutil.AccessDenied, psutil.ZombieProcess): + pass # Can't check cmdline, trust start time match + + return os_proc + + @property + def is_running(self) -> bool: + """ + Check if process is currently running via psutil. + + More reliable than checking status field since it validates + the actual OS process exists and matches our record. + """ + return self.proc is not None and self.proc.is_running() + + def is_alive(self) -> bool: + """ + Alias for is_running, for compatibility with subprocess.Popen API. 
+ """ + return self.is_running + + def get_memory_info(self) -> dict | None: + """Get memory usage if process is running.""" + if self.proc: + try: + mem = self.proc.memory_info() + return {'rss': mem.rss, 'vms': mem.vms} + except (psutil.NoSuchProcess, psutil.AccessDenied): + pass + return None + + def get_cpu_percent(self) -> float | None: + """Get CPU usage percentage if process is running.""" + if self.proc: + try: + return self.proc.cpu_percent(interval=0.1) + except (psutil.NoSuchProcess, psutil.AccessDenied): + pass + return None + + def get_children_pids(self) -> list[int]: + """Get PIDs of child processes from OS (not DB).""" + if self.proc: + try: + return [child.pid for child in self.proc.children(recursive=True)] + except (psutil.NoSuchProcess, psutil.AccessDenied): + pass + return [] +``` + +**Key Safety Features:** + +1. **Start time validation**: `psutil.Process.create_time()` must match `self.started_at` within `START_TIME_TOLERANCE` (5 seconds) +2. **Machine check**: Only returns `proc` if on the same machine where process ran +3. **Command validation**: Optional extra check that binary name matches +4. **Returns None on mismatch**: Never returns a stale/wrong psutil.Process + +**Usage:** +```python +process = Process.objects.get(id=some_id) + +# Safe - returns None if PID was recycled +if process.proc: + print(f"Memory: {process.proc.memory_info().rss}") + print(f"CPU: {process.proc.cpu_percent()}") + process.proc.terminate() # Safe to kill - we validated it's OUR process + +# Convenience properties +if process.is_running: + print("Still running!") +``` + +### 1.6 Add Process Lifecycle Methods + +Move logic from `process_utils.py` and `hooks.py` into the model: + +```python +class Process(ModelWithHealthStats): + # ... existing fields ... 
+ + @property + def pid_file(self) -> Path: + """Path to PID file for this process.""" + return Path(self.pwd) / 'process.pid' + + @property + def cmd_file(self) -> Path: + """Path to cmd.sh script for this process.""" + return Path(self.pwd) / 'cmd.sh' + + @property + def stdout_file(self) -> Path: + """Path to stdout log.""" + return Path(self.pwd) / 'stdout.log' + + @property + def stderr_file(self) -> Path: + """Path to stderr log.""" + return Path(self.pwd) / 'stderr.log' + + def _write_pid_file(self) -> None: + """Write PID file with mtime set to process start time.""" + from archivebox.misc.process_utils import write_pid_file_with_mtime + if self.pid and self.started_at: + write_pid_file_with_mtime( + self.pid_file, + self.pid, + self.started_at.timestamp() + ) + + def _write_cmd_file(self) -> None: + """Write cmd.sh script for debugging/validation.""" + from archivebox.misc.process_utils import write_cmd_file + write_cmd_file(self.cmd_file, self.cmd) + + def _build_env(self) -> dict: + """Build environment dict for subprocess, merging stored env with system.""" + import os + env = os.environ.copy() + env.update(self.env or {}) + return env + + def launch(self, background: bool = False) -> 'Process': + """ + Spawn the subprocess and update this Process record. + + Args: + background: If True, don't wait for completion (for daemons/bg hooks) + + Returns: + self (updated with pid, started_at, etc.) 
+ """ + import subprocess + import time + from django.utils import timezone + + # Ensure output directory exists + Path(self.pwd).mkdir(parents=True, exist_ok=True) + + # Write cmd.sh for debugging + self._write_cmd_file() + + with open(self.stdout_file, 'w') as out, open(self.stderr_file, 'w') as err: + proc = subprocess.Popen( + self.cmd, + cwd=self.pwd, + stdout=out, + stderr=err, + env=self._build_env(), + ) + + self.pid = proc.pid + self.started_at = timezone.now() + self.status = self.StatusChoices.RUNNING + self.save() + + self._write_pid_file() + + if not background: + try: + proc.wait(timeout=self.timeout) + self.exit_code = proc.returncode + except subprocess.TimeoutExpired: + proc.kill() + proc.wait() + self.exit_code = -1 + + self.ended_at = timezone.now() + self.stdout = self.stdout_file.read_text() + self.stderr = self.stderr_file.read_text() + self.status = self.StatusChoices.EXITED + self.save() + + return self + + def is_alive(self) -> bool: + """Check if this process is still running.""" + from archivebox.misc.process_utils import validate_pid_file + + if self.status == self.StatusChoices.EXITED: + return False + + if not self.pid: + return False + + return validate_pid_file(self.pid_file, self.cmd_file) + + def kill(self, signal_num: int = 15) -> bool: + """ + Kill this process and update status. + + Uses self.proc for safe killing - only kills if PID matches + our recorded process (prevents killing recycled PIDs). 
+ + Args: + signal_num: Signal to send (default SIGTERM=15) + + Returns: + True if killed successfully, False otherwise + """ + from django.utils import timezone + + # Use validated psutil.Process to ensure we're killing the right process + proc = self.proc + if proc is None: + # Process doesn't exist or PID was recycled - just update status + if self.status != self.StatusChoices.EXITED: + self.status = self.StatusChoices.EXITED + self.ended_at = self.ended_at or timezone.now() + self.save() + return False + + try: + # Safe to kill - we validated it's our process via start time match + proc.send_signal(signal_num) + + # Update our record + self.exit_code = -signal_num + self.ended_at = timezone.now() + self.status = self.StatusChoices.EXITED + self.save() + + # Clean up PID file + self.pid_file.unlink(missing_ok=True) + + return True + except (psutil.NoSuchProcess, psutil.AccessDenied, ProcessLookupError): + # Process already exited between proc check and kill + self.status = self.StatusChoices.EXITED + self.ended_at = self.ended_at or timezone.now() + self.save() + return False + + def poll(self) -> int | None: + """ + Check if process has exited and update status if so. + + Returns: + exit_code if exited, None if still running + """ + from django.utils import timezone + + if self.status == self.StatusChoices.EXITED: + return self.exit_code + + if not self.is_alive(): + # Process exited - read output and update status + if self.stdout_file.exists(): + self.stdout = self.stdout_file.read_text() + if self.stderr_file.exists(): + self.stderr = self.stderr_file.read_text() + + # Try to get exit code from pid file or default to unknown + self.exit_code = self.exit_code or -1 + self.ended_at = timezone.now() + self.status = self.StatusChoices.EXITED + self.save() + return self.exit_code + + return None # Still running + + def wait(self, timeout: int | None = None) -> int: + """ + Wait for process to exit, polling periodically. 
+ + Args: + timeout: Max seconds to wait (None = use self.timeout) + + Returns: + exit_code + + Raises: + TimeoutError if process doesn't exit in time + """ + import time + + timeout = timeout or self.timeout + start = time.time() + + while True: + exit_code = self.poll() + if exit_code is not None: + return exit_code + + if time.time() - start > timeout: + raise TimeoutError(f"Process {self.id} did not exit within {timeout}s") + + time.sleep(0.1) +``` + +--- + +## Phase 2: Hook System Changes (Detailed) + +This section provides a line-by-line mapping of current code to required changes. + +### 2.1 Current Architecture Overview + +**Current Flow:** +``` +ArchiveResult.run() [core/models.py:2463] + └── run_hook() [hooks.py:238] + └── subprocess.Popen() [hooks.py:381] + └── writes: stdout.log, stderr.log, hook.pid, cmd.sh +``` + +**Target Flow:** +``` +ArchiveResult.run() + └── run_hook(parent_process=self.process) # Pass existing Process FK + └── hook_process = Process.objects.create(parent=parent_process, type=HOOK) + └── hook_process.launch(background=is_bg) # Uses Process methods + └── writes: stdout.log, stderr.log via Process.stdout_file/stderr_file + └── Process handles PID file internally + └── parse JSONL for {"type": "Process"} records → create child binary Processes +``` + +### 2.2 Changes to `hooks.py` + +#### 2.2.1 Update `run_hook()` Signature and Body + +**File:** `archivebox/hooks.py` lines 238-483 + +**CURRENT CODE (lines 374-398):** +```python +# Set up output files for ALL hooks (useful for debugging) +stdout_file = output_dir / 'stdout.log' +stderr_file = output_dir / 'stderr.log' +pid_file = output_dir / 'hook.pid' +cmd_file = output_dir / 'cmd.sh' + +try: + # Write command script for validation + from archivebox.misc.process_utils import write_cmd_file + write_cmd_file(cmd_file, cmd) + + # Open log files for writing + with open(stdout_file, 'w') as out, open(stderr_file, 'w') as err: + process = subprocess.Popen( + cmd, + cwd=str(output_dir), + 
stdout=out, + stderr=err, + env=env, + ) + + # Write PID with mtime set to process start time for validation + from archivebox.misc.process_utils import write_pid_file_with_mtime + process_start_time = time.time() + write_pid_file_with_mtime(pid_file, process.pid, process_start_time) + + if is_background: + # Background hook - return None immediately, don't wait + return None +``` + +**NEW CODE:** +```python +def run_hook( + script: Path, + output_dir: Path, + config: Dict[str, Any], + timeout: Optional[int] = None, + parent_process: Optional['Process'] = None, # NEW: from ArchiveResult.process + **kwargs: Any +) -> HookResult: + from archivebox.machine.models import Process, Machine + + # ... existing setup (lines 270-372) ... + + # Create Process record for this hook execution + # Parent is the ArchiveResult's Process (passed from ArchiveResult.run()) + hook_process = Process.objects.create( + machine=Machine.current(), + parent=parent_process, + process_type=Process.TypeChoices.HOOK, + cmd=cmd, + pwd=str(output_dir), + env={k: v for k, v in env.items() if k not in os.environ}, # Only store non-default env + timeout=timeout, + status=Process.StatusChoices.QUEUED, + ) + + # Use Process.launch() which handles: + # - subprocess.Popen + # - PID file with mtime validation + # - cmd.sh script + # - stdout/stderr capture + # - status transitions + if is_background: + hook_process.launch(background=True) + # Return None for background hooks (existing behavior) + # HookResult not returned - caller uses hook_process.id to track + return None + else: + hook_process.launch(background=False) # Blocks until completion + + # Read output from Process (instead of files directly) + stdout = hook_process.stdout + stderr = hook_process.stderr + returncode = hook_process.exit_code + + # ... existing JSONL parsing (lines 427-448) ... 
+ + # NEW: Create child Process records for binaries reported in JSONL + for record in records: + if record.get('type') == 'Process': + Process.objects.create( + machine=hook_process.machine, + parent=hook_process, + process_type=Process.TypeChoices.BINARY, + cmd=record.get('cmd', []), + pwd=record.get('pwd', str(output_dir)), + pid=record.get('pid'), + exit_code=record.get('exit_code'), + started_at=parse_ts(record.get('started_at')), + ended_at=parse_ts(record.get('ended_at')), + status=Process.StatusChoices.EXITED, + ) + + return HookResult( + returncode=returncode, + stdout=stdout, + stderr=stderr, + # ... existing fields ... + process_id=str(hook_process.id), # NEW + ) +``` + +#### 2.2.2 Update `process_is_alive()` to Use Process Model + +**CURRENT CODE (lines 1238-1256):** +```python +def process_is_alive(pid_file: Path) -> bool: + """Check if process in PID file is still running.""" + if not pid_file.exists(): + return False + try: + pid = int(pid_file.read_text().strip()) + os.kill(pid, 0) + return True + except (OSError, ValueError): + return False +``` + +**NEW CODE:** +```python +def process_is_alive(pid_file_or_process: 'Path | Process') -> bool: + """ + Check if process is still running. 
+ + Accepts either: + - Path to hook.pid file (legacy) + - Process model instance (new) + """ + from archivebox.machine.models import Process + + if isinstance(pid_file_or_process, Process): + return pid_file_or_process.is_alive() + + # Legacy path-based check (for backwards compatibility) + pid_file = pid_file_or_process + if not pid_file.exists(): + return False + + # Try to find matching Process record + try: + pid = int(pid_file.read_text().strip()) + process = Process.objects.get_by_pid(pid) + if process: + return process.is_alive() + except (ValueError, Process.DoesNotExist): + pass + + # Fallback to OS check + from archivebox.misc.process_utils import validate_pid_file + return validate_pid_file(pid_file) +``` + +#### 2.2.3 Update `kill_process()` to Use Process Model + +**CURRENT CODE (lines 1259-1282):** +```python +def kill_process(pid_file: Path, sig: int = signal.SIGTERM, validate: bool = True): + """Kill process in PID file with optional validation.""" + from archivebox.misc.process_utils import safe_kill_process + + if validate: + cmd_file = pid_file.parent / 'cmd.sh' + safe_kill_process(pid_file, cmd_file, signal_num=sig) + else: + # Legacy behavior + ... +``` + +**NEW CODE:** +```python +def kill_process( + pid_file_or_process: 'Path | Process', + sig: int = signal.SIGTERM, + validate: bool = True +): + """ + Kill process with optional validation. 
+ + Accepts either: + - Path to hook.pid file (legacy) + - Process model instance (new) + """ + from archivebox.machine.models import Process + + if isinstance(pid_file_or_process, Process): + pid_file_or_process.kill(signal_num=sig) + return + + # Legacy path-based kill + pid_file = pid_file_or_process + + # Try to find matching Process record first + try: + pid = int(pid_file.read_text().strip()) + process = Process.objects.get_by_pid(pid) + if process: + process.kill(signal_num=sig) + return + except (ValueError, Process.DoesNotExist, FileNotFoundError): + pass + + # Fallback to file-based kill + if validate: + from archivebox.misc.process_utils import safe_kill_process + cmd_file = pid_file.parent / 'cmd.sh' + safe_kill_process(pid_file, cmd_file, signal_num=sig) +``` + +### 2.3 Changes to `core/models.py` - ArchiveResult + +#### 2.3.1 Update `ArchiveResult.run()` to Pass Parent Process + +**File:** `archivebox/core/models.py` lines 2463-2565 + +**CURRENT CODE (lines 2527-2535):** +```python +result = run_hook( + hook, + output_dir=plugin_dir, + config=config, + url=self.snapshot.url, + snapshot_id=str(self.snapshot.id), + crawl_id=str(self.snapshot.crawl.id), + depth=self.snapshot.depth, +) +``` + +**NEW CODE:** +```python +result = run_hook( + hook, + output_dir=plugin_dir, + config=config, + parent_process=self.process, # NEW: Pass our Process as parent for hook's Process + url=self.snapshot.url, + snapshot_id=str(self.snapshot.id), + crawl_id=str(self.snapshot.crawl.id), + depth=self.snapshot.depth, +) +``` + +#### 2.3.2 Update `ArchiveResult.update_from_output()` to Use Process + +**File:** `archivebox/core/models.py` lines 2568-2700 + +**CURRENT CODE (lines 2598-2600):** +```python +# Read and parse JSONL output from stdout.log +stdout_file = plugin_dir / 'stdout.log' +stdout = stdout_file.read_text() if stdout_file.exists() else '' +``` + +**NEW CODE:** +```python +# Read output from Process record (populated by Process.launch()) +if self.process_id: + # 
Process already has stdout/stderr from launch() + stdout = self.process.stdout + stderr = self.process.stderr +else: + # Fallback to file-based read (legacy) + stdout_file = plugin_dir / 'stdout.log' + stdout = stdout_file.read_text() if stdout_file.exists() else '' +``` + +### 2.4 Changes to `core/models.py` - Snapshot + +#### 2.4.1 Update `Snapshot.cleanup()` to Use Process Model + +**File:** `archivebox/core/models.py` lines 1381-1401 + +**CURRENT CODE:** +```python +def cleanup(self): + from archivebox.hooks import kill_process + + if not self.OUTPUT_DIR.exists(): + return + + # Find all .pid files in this snapshot's output directory + for pid_file in self.OUTPUT_DIR.glob('**/*.pid'): + kill_process(pid_file, validate=True) + + # Update all STARTED ArchiveResults from filesystem + results = self.archiveresult_set.filter(status=ArchiveResult.StatusChoices.STARTED) + for ar in results: + ar.update_from_output() +``` + +**NEW CODE:** +```python +def cleanup(self): + """ + Clean up background ArchiveResult hooks. + + Uses Process model to find and kill running hooks. + Falls back to PID file scanning for legacy compatibility. 
+ """ + from archivebox.machine.models import Process + + # Kill running hook Processes for this snapshot's ArchiveResults + for ar in self.archiveresult_set.filter(status=ArchiveResult.StatusChoices.STARTED): + if ar.process_id: + # Get hook Processes that are children of this AR's Process + hook_processes = Process.objects.filter( + parent=ar.process, + process_type=Process.TypeChoices.HOOK, + status=Process.StatusChoices.RUNNING, + ) + for hook_proc in hook_processes: + hook_proc.kill() + + # Also kill any child binary processes + if ar.process_id: + for child in ar.process.children.filter(status=Process.StatusChoices.RUNNING): + child.kill() + + # Legacy fallback: scan for .pid files not tracked in DB + if self.OUTPUT_DIR.exists(): + from archivebox.hooks import kill_process + for pid_file in self.OUTPUT_DIR.glob('**/*.pid'): + kill_process(pid_file, validate=True) + + # Update all STARTED ArchiveResults from filesystem/Process + for ar in self.archiveresult_set.filter(status=ArchiveResult.StatusChoices.STARTED): + ar.update_from_output() +``` + +#### 2.4.2 Update `Snapshot.has_running_background_hooks()` to Use Process Model + +**CURRENT CODE (lines 1403-1420):** +```python +def has_running_background_hooks(self) -> bool: + from archivebox.hooks import process_is_alive + + if not self.OUTPUT_DIR.exists(): + return False + + for plugin_dir in self.OUTPUT_DIR.iterdir(): + if not plugin_dir.is_dir(): + continue + pid_file = plugin_dir / 'hook.pid' + if process_is_alive(pid_file): + return True + + return False +``` + +**NEW CODE:** +```python +def has_running_background_hooks(self) -> bool: + """ + Check if any ArchiveResult background hooks are still running. + + Uses Process model for tracking, falls back to PID file check. 
+ """ + from archivebox.machine.models import Process + + # Check via Process model (preferred) + for ar in self.archiveresult_set.filter(status=ArchiveResult.StatusChoices.STARTED): + if ar.process_id: + # Check if hook Process children are running + running_hooks = Process.objects.filter( + parent=ar.process, + process_type=Process.TypeChoices.HOOK, + status=Process.StatusChoices.RUNNING, + ).exists() + if running_hooks: + return True + + # Also check the AR's own process + if ar.process.is_alive(): + return True + + # Legacy fallback: check PID files + if self.OUTPUT_DIR.exists(): + from archivebox.hooks import process_is_alive + for plugin_dir in self.OUTPUT_DIR.iterdir(): + if plugin_dir.is_dir(): + pid_file = plugin_dir / 'hook.pid' + if process_is_alive(pid_file): + return True + + return False +``` + +### 2.5 Hook JSONL Output Contract Update + +Hooks should now output `{"type": "Process", ...}` records for any binaries they run: + +```jsonl +{"type": "ArchiveResult", "status": "succeeded", "output_str": "Downloaded page"} +{"type": "Process", "cmd": ["/usr/bin/wget", "-p", "https://example.com"], "pid": 12345, "exit_code": 0, "started_at": "2024-01-15T10:30:00Z", "ended_at": "2024-01-15T10:30:05Z"} +{"type": "Process", "cmd": ["/usr/bin/curl", "-O", "image.png"], "pid": 12346, "exit_code": 0} +``` + +This allows full tracking of the process hierarchy: +``` +Process(archivebox add, type=CLI) + └── Process(orchestrator, type=ORCHESTRATOR) + └── Process(archiveresult_worker, type=WORKER) + └── Process(on_Snapshot__50_wget.py, type=HOOK) # ArchiveResult.process + └── Process(wget -p ..., type=BINARY) # from JSONL + └── Process(curl -O ..., type=BINARY) # from JSONL +``` + +--- + +## Phase 3: Worker System Changes + +### 3.1 Track Worker Processes in Database (Simplified with Process.current()) + +**File:** `archivebox/workers/worker.py` + +With `Process.current()`, tracking becomes trivial: + +```python +class Worker: + # ... existing code ... 
+ + db_process: 'Process | None' = None # Database Process record + + def on_startup(self) -> None: + """Called when worker starts.""" + from archivebox.machine.models import Process + + self.pid = os.getpid() + self.pid_file = write_pid_file(self.name, self.worker_id) + + # Process.current() automatically: + # - Creates record with correct process_type (detected from sys.argv) + # - Finds parent via PPID (orchestrator) + # - Sets machine, pid, started_at, status + self.db_process = Process.current() + + # ... existing logging ... + + # _get_parent_process() NO LONGER NEEDED - Process.current() uses PPID + + def on_shutdown(self, error: BaseException | None = None) -> None: + """Called when worker shuts down.""" + # ... existing code ... + + # Update database Process record + if self.db_process: + self.db_process.exit_code = 0 if error is None else 1 + self.db_process.ended_at = timezone.now() + self.db_process.status = Process.StatusChoices.EXITED + if error: + self.db_process.stderr = str(error) + self.db_process.save() +``` + +### 3.2 Track Orchestrator Process (Simplified) + +**File:** `archivebox/workers/orchestrator.py` + +```python +class Orchestrator: + # ... existing code ... + + db_process: 'Process | None' = None + + def on_startup(self) -> None: + """Called when orchestrator starts.""" + from archivebox.machine.models import Process + + self.pid = os.getpid() + self.pid_file = write_pid_file('orchestrator', worker_id=0) + + # Process.current() handles everything: + # - Detects type as ORCHESTRATOR from "orchestrator" in sys.argv + # - Finds parent (supervisord) via PPID lookup + self.db_process = Process.current() + + # ... existing logging ... + + # _get_parent_process() NO LONGER NEEDED +``` + +### 3.3 Track Supervisord Process (Detailed) + +**File:** `archivebox/workers/supervisord_util.py` + +Supervisord is special: it's spawned by `subprocess.Popen` (not through Process.current()). +We create its Process record manually after spawning. 
+ +#### 3.3.1 Update Module-Level Variables + +**CURRENT CODE (line 31):** +```python +# Global reference to supervisord process for cleanup +_supervisord_proc = None +``` + +**NEW CODE:** +```python +# Global references for cleanup +_supervisord_proc = None +_supervisord_db_process = None # NEW: Database Process record +``` + +#### 3.3.2 Update `start_new_supervisord_process()` + +**CURRENT CODE (lines 263-278):** +```python +proc = subprocess.Popen( + f"supervisord --configuration={CONFIG_FILE}", + stdin=None, + stdout=log_handle, + stderr=log_handle, + shell=True, + start_new_session=False, +) + +global _supervisord_proc +_supervisord_proc = proc + +time.sleep(2) +return get_existing_supervisord_process() +``` + +**NEW CODE:** +```python +from archivebox.machine.models import Process, Machine +import psutil + +proc = subprocess.Popen( + f"supervisord --configuration={CONFIG_FILE}", + stdin=None, + stdout=log_handle, + stderr=log_handle, + shell=True, + start_new_session=False, +) + +global _supervisord_proc, _supervisord_db_process +_supervisord_proc = proc + +# Create Process record for supervisord +# Parent is Process.current() (the CLI command that started it) +try: + os_proc = psutil.Process(proc.pid) + started_at = datetime.fromtimestamp(os_proc.create_time(), tz=timezone.utc) +except (psutil.NoSuchProcess, psutil.AccessDenied): + started_at = timezone.now() + +_supervisord_db_process = Process.objects.create( + machine=Machine.current(), + parent=Process.current(), # CLI process that spawned supervisord + process_type=Process.TypeChoices.SUPERVISORD, + cmd=['supervisord', f'--configuration={CONFIG_FILE}'], + pwd=str(CONSTANTS.DATA_DIR), + pid=proc.pid, + started_at=started_at, + status=Process.StatusChoices.RUNNING, +) + +time.sleep(2) +return get_existing_supervisord_process() +``` + +#### 3.3.3 Update `stop_existing_supervisord_process()` + +**ADD at end of function (after line 217):** +```python +# Update database Process record +global 
_supervisord_db_process +if _supervisord_db_process: + _supervisord_db_process.status = Process.StatusChoices.EXITED + _supervisord_db_process.ended_at = timezone.now() + _supervisord_db_process.exit_code = 0 + _supervisord_db_process.save() + _supervisord_db_process = None +``` + +#### 3.3.4 Diagram: Supervisord Process Hierarchy + +``` +Process(archivebox server, type=CLI) # Created by Process.current() in main() + │ + └── Process(supervisord, type=SUPERVISORD) # Created manually in start_new_supervisord_process() + │ + ├── Process(orchestrator, type=ORCHESTRATOR) # Created by Process.current() in Orchestrator.on_startup() + │ │ + │ └── Process(crawl_worker, type=WORKER) + │ │ + │ └── Process(snapshot_worker, type=WORKER) + │ │ + │ └── Process(archiveresult_worker, type=WORKER) + │ │ + │ └── Process(hook, type=HOOK) # ArchiveResult.process + │ │ + │ └── Process(binary, type=BINARY) + │ + └── Process(daphne, type=WORKER) # Web server worker +``` + +Note: Processes spawned BY supervisord (like orchestrator, daphne) are NOT registered by supervisord itself. +Each one enters the Process hierarchy only when it calls `Process.current()` on startup +(in `Worker.on_startup()` / `Orchestrator.on_startup()`), which links it to supervisord's record as its parent. + +The PPID-based linking works because: +1. Supervisord spawns orchestrator process +2. Orchestrator calls `Process.current()` in `on_startup()` +3. `Process.current()` looks up PPID → finds supervisord's Process → sets as parent + +--- + +## Phase 4: CLI Entry Point Changes + +### 4.1 Simplified: Just Call `Process.current()` + +With `Process.current()` implemented, CLI entry becomes trivial: + +**File:** `archivebox/__main__.py` or `archivebox/cli/__init__.py` + +```python +def main(): + from archivebox.machine.models import Process + + # Process.current() auto-creates the CLI process record + # It detects process_type from sys.argv, finds parent via PPID + cli_process = Process.current() + + try: + # ... existing CLI dispatch ... 
+ result = run_cli_command(...) + cli_process.exit_code = result + except Exception as e: + cli_process.exit_code = 1 + cli_process.stderr = str(e) + raise + finally: + cli_process.ended_at = timezone.now() + cli_process.status = Process.StatusChoices.EXITED + cli_process.save() +``` + +**That's it!** No thread-local context needed. `Process.current()` handles: +- Creating the record with correct `process_type` +- Finding parent via PPID lookup +- Caching to avoid repeated queries +- Validating PID hasn't been reused + +### 4.2 Context Management (DEPRECATED - Replaced by Process.current()) + +~~The following is no longer needed since `Process.current()` uses PPID lookup:~~ + +```python +# archivebox/machine/context.py - NO LONGER NEEDED + +# Process.current() replaces all of this by using os.getppid() +# to find parent Process records automatically. + +# OLD approach (don't use): +def get_cli_process() -> Optional['Process']: + """ + Find the CLI process that started this execution. + + Tries: + 1. Thread-local storage (set by main CLI entry point) + 2. Environment variable ARCHIVEBOX_CLI_PROCESS_ID + 3. 
Query for running CLI process on this machine with matching PPID + """ + # Try thread-local first + process = get_current_cli_process() + if process: + return process + + # Try environment variable + import os + from archivebox.machine.models import Process + + process_id = os.environ.get('ARCHIVEBOX_CLI_PROCESS_ID') + if process_id: + try: + return Process.objects.get(id=process_id) + except Process.DoesNotExist: + pass + + # Fallback: find by PPID + ppid = os.getppid() + return Process.objects.filter( + pid=ppid, + process_type=Process.TypeChoices.CLI, + status=Process.StatusChoices.RUNNING, + ).first() +``` + +--- + +## Phase 5: ArchiveResult Integration + +### 5.1 Update ArchiveResult.run() to Pass Parent Process + +**File:** `archivebox/core/models.py` + +```python +class ArchiveResult(ModelWithOutputDir, ...): + def run(self): + """Execute this ArchiveResult's hook and update status.""" + from archivebox.hooks import run_hook + + # ... existing setup ... + + for hook in hooks: + result = run_hook( + hook, + output_dir=plugin_dir, + config=config, + parent_process=self.process, # NEW: pass our Process as parent + url=self.snapshot.url, + snapshot_id=str(self.snapshot.id), + crawl_id=str(self.snapshot.crawl.id), + depth=self.snapshot.depth, + ) + + # ... rest of processing ... 
+``` + +### 5.2 Update ArchiveResult.save() to Link Worker Process + +```python +class ArchiveResult(ModelWithOutputDir, ...): + def save(self, *args, **kwargs): + is_new = self._state.adding + + if is_new and not self.process_id: + from archivebox.machine.models import Process, Machine + from archivebox.machine.context import get_current_worker_process + + # Get the worker's Process as parent + worker_process = get_current_worker_process() + + process = Process.objects.create( + machine=Machine.current(), + parent=worker_process, # NEW: link to worker + process_type=Process.TypeChoices.HOOK, # Will become HOOK when run + pwd=str(Path(self.snapshot.output_dir) / self.plugin), + cmd=[], + status='queued', + timeout=120, + env={}, + ) + self.process = process + + # ... rest of save ... +``` + +--- + +## Phase 6: Migration + +### 6.1 Create Migration File + +```python +# archivebox/machine/migrations/XXXX_add_process_parent_and_type.py + +from django.db import migrations, models +import django.db.models.deletion + +class Migration(migrations.Migration): + dependencies = [ + ('machine', 'XXXX_previous_migration'), + ] + + operations = [ + # Add parent FK + migrations.AddField( + model_name='process', + name='parent', + field=models.ForeignKey( + blank=True, + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name='children', + to='machine.process', + ), + ), + + # Add process_type field + migrations.AddField( + model_name='process', + name='process_type', + field=models.CharField( + choices=[ + ('cli', 'CLI Command'), + ('supervisord', 'Supervisord Daemon'), + ('orchestrator', 'Orchestrator'), + ('worker', 'Worker Process'), + ('hook', 'Hook Script'), + ('binary', 'Binary Execution'), + ], + default='binary', + max_length=16, + db_index=True, + ), + ), + + # Add index for parent queries + migrations.AddIndex( + model_name='process', + index=models.Index( + fields=['parent', 'status'], + name='machine_pro_parent__idx', + ), + ), + ] +``` + +--- + +## 
Phase 7: Admin UI Updates + +### 7.1 Update Process Admin + +**File:** `archivebox/machine/admin.py` + +```python +@admin.register(Process) +class ProcessAdmin(admin.ModelAdmin): + list_display = ['id', 'process_type', 'cmd_summary', 'status', 'parent_link', 'started_at', 'duration'] + list_filter = ['process_type', 'status', 'machine'] + search_fields = ['cmd', 'stdout', 'stderr'] + readonly_fields = ['parent', 'children_count', 'depth', 'tree_view'] + + def cmd_summary(self, obj): + """Show first 50 chars of command.""" + cmd_str = ' '.join(obj.cmd[:3]) if obj.cmd else '' + return cmd_str[:50] + '...' if len(cmd_str) > 50 else cmd_str + + def parent_link(self, obj): + if obj.parent: + url = reverse('admin:machine_process_change', args=[obj.parent.pk]) + return format_html('<a href="{}">{}</a>', url, obj.parent.process_type) + return '-' + + def children_count(self, obj): + return obj.children.count() + + def depth(self, obj): + return obj.depth + + def duration(self, obj): + if obj.started_at and obj.ended_at: + delta = obj.ended_at - obj.started_at + return f'{delta.total_seconds():.1f}s' + elif obj.started_at: + delta = timezone.now() - obj.started_at + return f'{delta.total_seconds():.1f}s (running)' + return '-' + + def tree_view(self, obj): + """Show process tree from root to this process.""" + ancestors = obj.ancestors[::-1] # Reverse to show root first + lines = [] + for i, ancestor in enumerate(ancestors): + prefix = ' ' * i + '└── ' if i > 0 else '' + lines.append(f'{prefix}{ancestor.process_type}: {ancestor.cmd[0] if ancestor.cmd else "?"} (pid={ancestor.pid})') + prefix = ' ' * len(ancestors) + '└── ' if ancestors else '' + lines.append(f'{prefix}[CURRENT] {obj.process_type}: {obj.cmd[0] if obj.cmd else "?"} (pid={obj.pid})') + return format_html('
    <pre>{}</pre>
    ', '\n'.join(lines)) +``` + +--- + +## Files to Modify Summary + +| File | Changes | +|------|---------| +| `archivebox/machine/models.py` | Add `parent` FK, `process_type` field, `Process.current()`, lifecycle methods | +| `archivebox/machine/migrations/XXXX_*.py` | New migration for schema changes | +| `archivebox/machine/admin.py` | Update admin with tree visualization | +| `archivebox/hooks.py` | Update `run_hook()` to create/use Process records | +| `archivebox/workers/worker.py` | Simplify: just call `Process.current()` in `on_startup()` | +| `archivebox/workers/orchestrator.py` | Simplify: just call `Process.current()` in `on_startup()` | +| `archivebox/workers/supervisord_util.py` | Add `Process.current()` call when starting supervisord | +| `archivebox/core/models.py` | Update ArchiveResult to use `Process.current()` as parent | +| `archivebox/__main__.py` or CLI entry | Call `Process.current()` at startup, update on exit | +| `archivebox/misc/process_utils.py` | Keep as low-level utilities (called by Process methods) | + +**Note:** `archivebox/machine/context.py` is NOT needed - `Process.current()` uses PPID lookup instead of thread-local context. + +--- + +## Testing Plan + +### Unit Tests + +1. **Process hierarchy creation** + - Create nested Process records + - Verify `parent`, `ancestors`, `depth`, `root` properties + - Test `get_descendants()` query + +2. **Process lifecycle** + - Test `launch()` for foreground and background processes + - Test `is_alive()`, `poll()`, `wait()`, `kill()` + - Verify status transitions + +3. **Hook integration** + - Mock hook execution + - Verify hook Process and binary Process records created + - Test parent-child relationships + +### Integration Tests + +1. **Full CLI flow** + - Run `archivebox add https://example.com` + - Verify complete Process tree from CLI → workers → hooks → binaries + - Check all status fields updated correctly + +2. 
**Worker lifecycle** + - Start orchestrator + - Verify orchestrator and worker Process records + - Stop and verify cleanup + +--- + +## Rollout Strategy + +1. **Phase 1-2**: Model changes + migration (backwards compatible, new fields nullable) +2. **Phase 3**: Worker tracking (can be feature-flagged) +3. **Phase 4**: CLI entry point (can be feature-flagged) +4. **Phase 5-6**: Full integration (requires all previous phases) +5. **Phase 7**: Admin UI (depends on model changes only) + +--- + +## Phase 8: Code Consolidation (Delete Redundant Logic) + +The goal is to consolidate all subprocess management into `Process` model methods, eliminating duplicate logic scattered across the codebase. + +### 8.1 Files to Simplify/Delete + +| File | Current Lines | After Consolidation | Savings | +|------|--------------|---------------------|---------| +| `workers/pid_utils.py` | ~192 lines | DELETE entirely | -192 | +| `misc/process_utils.py` | ~85 lines | Keep as low-level utils | 0 | +| `hooks.py` (run_hook) | ~100 lines | -50 lines (use Process.launch) | -50 | +| `hooks.py` (kill/alive) | ~50 lines | DELETE (use Process.kill/is_running) | -50 | +| `crawls/models.py` (cleanup) | ~100 lines | -70 lines (use Process.kill) | -70 | +| `supervisord_util.py` | ~50 lines process mgmt | -30 lines | -30 | +| **TOTAL** | | | **~-390 lines** | + +### 8.2 Detailed Consolidation Map + +#### `workers/pid_utils.py` → DELETE ENTIRELY + +| Current Function | Replacement | +|------------------|-------------| +| `write_pid_file(worker_type, worker_id)` | `Process.current()` auto-creates | +| `read_pid_file(path)` | `Process.objects.get_by_pid(pid)` | +| `remove_pid_file(path)` | Manual cleanup in `Process.kill()` and legacy hook cleanup code | +| `is_process_alive(pid)` | `Process.is_running` / `Process.proc is not None` | +| `get_all_pid_files()` | `Process.objects.filter(machine=Machine.current(), status=Process.StatusChoices.RUNNING)` | +| `get_all_worker_pids(type)` | 
`Process.objects.filter(machine=Machine.current(), process_type=type, status=Process.StatusChoices.RUNNING)` | +| `cleanup_stale_pid_files()` | `Process.cleanup_stale_running()` | +| `get_running_worker_count(type)` | `Process.objects.filter(...).count()` | +| `get_next_worker_id(type)` | Use `Max(worker_id)+1` under transaction or DB sequence to avoid race conditions | +| `stop_worker(pid, graceful)` | `Process.terminate(graceful_timeout)` or `Process.kill_tree()` | + +#### `hooks.py` Changes + +**Current `run_hook()` lines 374-398:** +```python +# DELETE these lines - replaced by Process.launch() +stdout_file = output_dir / 'stdout.log' +stderr_file = output_dir / 'stderr.log' +pid_file = output_dir / 'hook.pid' +cmd_file = output_dir / 'cmd.sh' +write_cmd_file(cmd_file, cmd) +with open(stdout_file, 'w') as out, open(stderr_file, 'w') as err: + process = subprocess.Popen(cmd, ...) + write_pid_file_with_mtime(pid_file, process.pid, time.time()) +``` + +**New `run_hook()` using Process:** +```python +# Only store env delta or allowlist to avoid leaking secrets +env_delta = {k: v for k, v in env.items() if k in ALLOWED_ENV_VARS} + +hook_process = Process.objects.create( + parent=parent_process, + process_type=Process.TypeChoices.HOOK, + cmd=cmd, pwd=str(output_dir), env=env_delta, timeout=timeout, +) +hook_process.launch(background=is_background) +# stdout/stderr/pid_file all handled internally by Process.launch() +``` + +**DELETE these functions entirely:** +```python +def process_is_alive(pid_file: Path) -> bool: # lines 1238-1256 +def kill_process(pid_file: Path, sig, validate): # lines 1259-1282 +``` + +**Replace with:** +```python +# Use Process methods directly: +process.is_running # replaces process_is_alive() +process.kill() # replaces kill_process() +``` + +#### `crawls/models.py` Changes + +**Current `Crawl.cleanup()` lines 418-493:** +```python +# DELETE all this inline process logic: +def is_process_alive(pid): + try: + os.kill(pid, 0) + return True + 
except (OSError, ProcessLookupError): + return False + +for pid_file in self.OUTPUT_DIR.glob('**/*.pid'): + if not validate_pid_file(pid_file, cmd_file): + pid_file.unlink(missing_ok=True) + continue + pid = int(pid_file.read_text().strip()) + os.killpg(pid, signal.SIGTERM) + time.sleep(2) + if not is_process_alive(pid): + pid_file.unlink(missing_ok=True) + continue + os.killpg(pid, signal.SIGKILL) + # ... more cleanup logic +``` + +**New `Crawl.cleanup()` using Process:** +```python +def cleanup(self): + # Kill all running child processes for this crawl + for snapshot in self.snapshot_set.all(): + for ar in snapshot.archiveresult_set.filter(status=ArchiveResult.StatusChoices.STARTED): + if ar.process_id: + # Kill hook process and all its children + ar.process.kill() + for child in ar.process.children.filter(status='running'): + child.kill() + + # Run on_CrawlEnd hooks (foreground) + # ... existing hook running logic ... +``` + +#### `supervisord_util.py` Changes + +**Current global tracking:** +```python +_supervisord_proc = None # subprocess.Popen reference + +def stop_existing_supervisord_process(): + global _supervisord_proc + if _supervisord_proc and _supervisord_proc.poll() is None: + _supervisord_proc.terminate() + _supervisord_proc.wait(timeout=5) + # ... fallback to PID file ... +``` + +**New using Process model:** +```python +_supervisord_db_process = None # Process model instance + +def start_new_supervisord_process(): + # ... existing subprocess.Popen ... + global _supervisord_db_process + _supervisord_db_process = Process.objects.create( + parent=Process.current(), + process_type=Process.TypeChoices.SUPERVISORD, + pid=proc.pid, + cmd=['supervisord', f'--configuration={CONFIG_FILE}'], + started_at=timezone.now(), + status=Process.StatusChoices.RUNNING, + ) + +def stop_existing_supervisord_process(): + global _supervisord_db_process + if _supervisord_db_process: + _supervisord_db_process.kill() # Handles children, PID validation, etc. 
+ _supervisord_db_process = None +``` + +#### `workers/worker.py` Changes + +**Current:** +```python +from .pid_utils import write_pid_file, remove_pid_file, ... + +def on_startup(self): + self.pid = os.getpid() + self.pid_file = write_pid_file(self.name, self.worker_id) + +def on_shutdown(self, error=None): + if self.pid_file: + remove_pid_file(self.pid_file) +``` + +**New:** +```python +# No import needed - Process.current() handles everything + +def on_startup(self): + self.db_process = Process.current() + # Process.current() auto-detects type, finds parent via PPID, creates record + +def on_shutdown(self, error=None): + if self.db_process: + self.db_process.exit_code = 0 if error is None else 1 + self.db_process.status = Process.StatusChoices.EXITED + self.db_process.ended_at = timezone.now() + self.db_process.save() +``` + +### 8.3 New Process Model Methods Summary + +All process operations now go through `Process`: + +```python +# Getting current process +Process.current() # Creates/retrieves Process for os.getpid() + +# Spawning new process +proc = Process.objects.create(parent=Process.current(), cmd=[...], ...) +proc.launch(background=False) # Handles Popen, PID file, stdout/stderr + +# Checking process status +proc.is_running # True if OS process exists and matches +proc.proc # psutil.Process or None (validated) +proc.poll() # Returns exit_code or None + +# Terminating process +proc.kill() # Safe kill with PID validation +proc.kill(SIGKILL) # Force kill + +# Waiting for completion +proc.wait(timeout=30) # Blocks until exit or timeout + +# Cleanup +Process.cleanup_stale_running() # Mark orphaned processes as EXITED +``` + +### 8.4 Benefits + +1. **Single Source of Truth**: All process state in database, queryable +2. **PID Reuse Protection**: `Process.proc` validates via psutil.create_time() +3. **Hierarchy Tracking**: `Process.parent` / `Process.children` for tree traversal +4. **Machine-Scoped**: All queries filter by `machine=Machine.current()` +5. 
**Audit Trail**: Every subprocess is logged with timestamps, exit codes +6. **No Stale PID Files**: Process records update status automatically + +--- + +## Open Questions + +1. **Performance**: Deep hierarchies with many children could slow queries. Consider: + - Adding `root_id` denormalized field for fast root lookup + - Using django-mptt or django-treebeard for efficient tree queries + - Limiting depth to prevent runaway recursion + +2. **Cleanup**: How long to retain Process records? + - Add `archivebox manage cleanup_processes --older-than=30d` + - Or automatic cleanup via Django management command + +3. **Stdout/Stderr storage**: For large outputs, consider: + - Storing in files and keeping path in DB + - Truncating to first/last N bytes + - Compressing before storage + +4. **Cross-machine hierarchies**: If processes span machines (distributed setup): + - Parent could be on different machine + - May need to relax FK constraint or use soft references diff --git a/old/TODO_rename_extractor_to_plugin.md b/old/TODO_rename_extractor_to_plugin.md new file mode 100644 index 0000000000..5b208a20b6 --- /dev/null +++ b/old/TODO_rename_extractor_to_plugin.md @@ -0,0 +1,517 @@ +# TODO: Rename Extractor to Plugin - Implementation Progress + +**Status**: 🟡 In Progress (2/13 phases complete) +**Started**: 2025-12-28 +**Estimated Files to Update**: ~150+ files + +--- + +## Progress Overview + +### ✅ Completed Phases (2/13) + +- [x] **Phase 1**: Database Migration - Created migration 0033 +- [x] **Phase 2**: Core Model Updates - Updated ArchiveResult, ArchiveResultManager, Snapshot models + +### 🟡 In Progress (1/13) + +- [ ] **Phase 3**: Hook Execution System (hooks.py - all function renames) + +### ⏳ Pending Phases (10/13) + +- [ ] **Phase 4**: JSONL Import/Export (misc/jsonl.py) +- [ ] **Phase 5**: CLI Commands (archivebox_extract, archivebox_add, archivebox_update) +- [ ] **Phase 6**: API Endpoints (v1_core.py, v1_cli.py) +- [ ] **Phase 7**: Admin Interface 
(admin_archiveresults.py, forms.py) +- [ ] **Phase 8**: Views and Templates (views.py, templatetags, progress_monitor.html) +- [ ] **Phase 9**: Worker System (workers/worker.py) +- [ ] **Phase 10**: State Machine (statemachines.py) +- [ ] **Phase 11**: Tests (test_migrations_helpers.py, test_recursive_crawl.py, etc.) +- [ ] **Phase 12**: Terminology Standardization (via_extractor→plugin, comments, docstrings) +- [ ] **Phase 13**: Run migrations and verify all tests pass + +--- + +## What's Been Completed So Far + +### Phase 1: Database Migration ✅ + +**File Created**: `archivebox/core/migrations/0033_rename_extractor_add_hook_name.py` + +Changes: +- Used `migrations.RenameField()` to rename `extractor` → `plugin` +- Added `hook_name` field (CharField, max_length=255, indexed, default='') +- Preserves all existing data, indexes, and constraints + +### Phase 2: Core Models ✅ + +**File Updated**: `archivebox/core/models.py` + +#### ArchiveResultManager +- Updated `indexable()` method to use `plugin__in` and `plugin=method` +- Changed reference from `ARCHIVE_METHODS_INDEXING_PRECEDENCE` to `EXTRACTOR_INDEXING_PRECEDENCE` + +#### ArchiveResult Model +**Field Changes**: +- Renamed field: `extractor` → `plugin` +- Added field: `hook_name` (stores full filename like `on_Snapshot__50_wget.py`) +- Updated comments to reference "plugin" instead of "extractor" + +**Method Updates**: +- `get_extractor_choices()` → `get_plugin_choices()` +- `__str__()`: Now uses `self.plugin` +- `save()`: Logs `plugin` instead of `extractor` +- `get_absolute_url()`: Uses `self.plugin` +- `extractor_module` property → `plugin_module` property +- `output_exists()`: Checks `self.plugin` directory +- `embed_path()`: Uses `self.plugin` for paths +- `create_output_dir()`: Creates `self.plugin` directory +- `output_dir_name`: Returns `self.plugin` +- `run()`: All references to extractor → plugin (including extractor_dir → plugin_dir) +- `update_from_output()`: All references updated to 
plugin/plugin_dir +- `_update_snapshot_title()`: Parameter renamed to `plugin_dir` +- `trigger_search_indexing()`: Passes `plugin=self.plugin` +- `output_dir` property: Returns plugin directory +- `is_background_hook()`: Uses `plugin_dir` + +#### Snapshot Model +**Method Updates**: +- `create_pending_archiveresults()`: Uses `get_enabled_plugins()`, filters by `plugin=plugin` +- `result_icons` (calc_icons): Maps by `r.plugin`, calls `get_plugin_name()` and `get_plugin_icon()` +- `_merge_archive_results_from_index()`: Maps by `(ar.plugin, ar.start_ts)`, supports both 'extractor' and 'plugin' keys for backwards compat +- `_create_archive_result_if_missing()`: Supports both 'extractor' and 'plugin' keys, creates with `plugin=plugin` +- `write_index_json()`: Writes `'plugin': ar.plugin` in archive_results +- `canonical_outputs()`: Updates `find_best_output_in_dir()` to use `plugin_name`, accesses `result.plugin`, creates keys like `{result.plugin}_path` +- `latest_outputs()`: Uses `get_plugins()`, filters by `plugin=plugin` +- `retry_failed_archiveresults()`: Updated docstring to reference "plugins" instead of "extractors" + +**Total Lines Changed in models.py**: ~50+ locations + +--- + +## Full Implementation Plan + +# ArchiveResult Model Refactoring Plan: Rename Extractor to Plugin + Add Hook Name Field + +## Overview +Refactor the ArchiveResult model and standardize terminology across the codebase: +1. Rename the `extractor` field to `plugin` in ArchiveResult model +2. Add a new `hook_name` field to store the specific hook filename that executed +3. Update all related code paths (CLI, API, admin, views, hooks, JSONL, etc.) +4. Standardize CLI flags from `--extract/--extractors` to `--plugins` +5. 
**Standardize terminology throughout codebase**: + - "parsers" → "parser plugins" + - "extractors" → "extractor plugins" + - "parser extractors" → "parser plugins" + - "archive methods" → "extractor plugins" + - Document apt/brew/npm/pip as "package manager plugins" in comments + +## Current State Analysis + +### ArchiveResult Model (archivebox/core/models.py:1679-1750) +```python +class ArchiveResult(ModelWithOutputDir, ...): + extractor = models.CharField(max_length=32, db_index=True) # e.g., "screenshot", "wget" + # New fields from migration 0029: + output_str, output_json, output_files, output_size, output_mimetypes + binary = ForeignKey('machine.Binary', ...) + # No hook_name field yet +``` + +### Hook Execution Flow +1. `ArchiveResult.run()` discovers hooks for the plugin (e.g., `wget/on_Snapshot__50_wget.py`) +2. `run_hook()` executes each hook script, captures output as HookResult +3. `update_from_output()` parses JSONL and updates ArchiveResult fields +4. Currently NO tracking of which specific hook file executed + +### Field Usage Across Codebase +**extractor field** is used in ~100 locations: +- **Model**: ArchiveResult.extractor field definition, __str__, manager queries +- **CLI**: archivebox_extract.py (--plugin flag), archivebox_add.py, tests +- **API**: v1_core.py (extractor filter), v1_cli.py (extract/extractors args) +- **Admin**: admin_archiveresults.py (list filter, display) +- **Views**: core/views.py (archiveresult_objects dict by extractor) +- **Template Tags**: core_tags.py (extractor_icon, extractor_thumbnail, extractor_embed) +- **Hooks**: hooks.py (get_extractors, get_extractor_name, run_hook output parsing) +- **JSONL**: misc/jsonl.py (archiveresult_to_jsonl serializes extractor) +- **Worker**: workers/worker.py (ArchiveResultWorker filters by extractor) +- **Statemachine**: statemachines.py (logs extractor in state transitions) + +--- + +## Implementation Plan + +### Phase 1: Database Migration (archivebox/core/migrations/) ✅ COMPLETE + 
+**Create migration 0033_rename_extractor_add_hook_name.py**: +1. Rename field: `extractor` → `plugin` (preserve index, constraints) +2. Add field: `hook_name` = CharField(max_length=255, blank=True, default='', db_index=True) + - **Stores full hook filename**: `on_Snapshot__50_wget.py`, `on_Crawl__10_chrome_session.js`, etc. + - Empty string for existing records (data migration sets all to '') +3. Update any indexes or constraints that reference extractor + +**Decision**: Full filename chosen for explicitness and easy grep-ability + +**Critical Files to Update**: +- ✅ ArchiveResult model field definitions +- ✅ Migration dependencies (latest: 0032) + +--- + +### Phase 2: Core Model Updates (archivebox/core/models.py) ✅ COMPLETE + +**ArchiveResult Model** (lines 1679-1820): +- ✅ Rename field: `extractor` → `plugin` +- ✅ Add field: `hook_name = models.CharField(...)` +- ✅ Update __str__: `f'...-> {self.plugin}'` +- ✅ Update absolute_url: Use plugin instead of extractor +- ✅ Update embed_path: Use plugin directory name + +**ArchiveResultManager** (lines 1669-1677): +- ✅ Update indexable(): `filter(plugin__in=INDEXABLE_METHODS, ...)` +- ✅ Update precedence: `When(plugin=method, ...)` + +**Snapshot Model** (lines 1000-1600): +- ✅ Update canonical_outputs: Access by plugin name +- ✅ Update create_pending_archiveresults: Use plugin parameter +- ✅ All queryset filters: `archiveresult_set.filter(plugin=...)` + +--- + +### Phase 3: Hook Execution System (archivebox/hooks.py) 🟡 IN PROGRESS + +**Function Renames**: +- [ ] `get_extractors()` → `get_plugins()` (lines 479-504) +- [ ] `get_parser_extractors()` → `get_parser_plugins()` (lines 507-514) +- [ ] `get_extractor_name()` → `get_plugin_name()` (lines 517-530) +- [ ] `is_parser_extractor()` → `is_parser_plugin()` (lines 533-536) +- [ ] `get_enabled_extractors()` → `get_enabled_plugins()` (lines 553-566) +- [ ] `get_extractor_template()` → `get_plugin_template()` (line 1048) +- [ ] `get_extractor_icon()` → 
`get_plugin_icon()` (line 1068) +- [ ] `get_all_extractor_icons()` → `get_all_plugin_icons()` (line 1092) + +**Update HookResult TypedDict** (lines 63-73): +- [ ] Add field: `hook_name: str` to store hook filename +- [ ] Add field: `plugin: str` (if not already present) + +**Update run_hook()** (lines 141-389): +- [ ] **Add hook_name parameter**: Pass hook filename to be stored in result +- [ ] Update HookResult to include hook_name field +- [ ] Update JSONL record output: Add `hook_name` key + +**Update ArchiveResult.run()** (lines 1838-1914): +- [ ] When calling run_hook, pass the hook filename +- [ ] Store hook_name in ArchiveResult before/after execution + +**Update ArchiveResult.update_from_output()** (lines 1916-2073): +- [ ] Parse hook_name from JSONL output +- [ ] Store in self.hook_name field +- [ ] If not present in JSONL, infer from directory/filename + +**Constants to Rename**: +- [ ] `ARCHIVE_METHODS_INDEXING_PRECEDENCE` → `EXTRACTOR_INDEXING_PRECEDENCE` + +**Comments/Docstrings**: Update all function docstrings to use "plugin" terminology + +--- + +### Phase 4: JSONL Import/Export (archivebox/misc/jsonl.py) + +**Update archiveresult_to_jsonl()** (lines 173-200): +- [ ] Change key: `'extractor': result.extractor` → `'plugin': result.plugin` +- [ ] Add key: `'hook_name': result.hook_name` + +**Update JSONL parsing**: +- [ ] **Accept both 'extractor' (legacy) and 'plugin' (new) keys when importing** +- [ ] Always write 'plugin' key in new exports (never 'extractor') +- [ ] Parse and store hook_name if present (backwards compat: empty if missing) + +**Decision**: Support both keys on import for smooth migration, always export new format + +--- + +### Phase 5: CLI Commands (archivebox/cli/) + +**archivebox_extract.py** (lines 1-230): +- [ ] Rename flag: `--plugin` stays (already correct!) 
+- [ ] Update internal references: extractor → plugin +- [ ] Update filter: `results.filter(plugin=plugin)` +- [ ] Update display: `result.plugin` + +**archivebox_add.py**: +- [ ] Rename config key: `'EXTRACTORS': plugins` → `'PLUGINS': plugins` (if not already) + +**archivebox_update.py**: +- [ ] Standardize to `--plugins` flag (currently may be --extractors or --extract) + +**tests/test_oneshot.py**: +- [ ] Update flag: `--extract=...` → `--plugins=...` + +--- + +### Phase 6: API Endpoints (archivebox/api/) + +**v1_core.py** (ArchiveResult API): +- [ ] Update schema field: `extractor: str` → `plugin: str` +- [ ] Update schema field: Add `hook_name: str = ''` +- [ ] Update FilterSchema: `q=[..., 'plugin', ...]` +- [ ] Update extractor filter: `plugin: Optional[str] = Field(None, q='plugin__icontains')` + +**v1_cli.py** (CLI API): +- [ ] Rename AddCommandSchema field: `extract: str` → `plugins: str` +- [ ] Rename UpdateCommandSchema field: `extractors: str` → `plugins: str` +- [ ] Update endpoint mapping: `args.plugins` → `plugins` parameter + +--- + +### Phase 7: Admin Interface (archivebox/core/) + +**admin_archiveresults.py**: +- [ ] Update all references: extractor → plugin +- [ ] Update list_filter: `'plugin'` instead of `'extractor'` +- [ ] Update ordering: `order_by('plugin')` +- [ ] Update get_plugin_icon: (rename from get_extractor_icon if exists) + +**admin_snapshots.py**: +- [ ] Update any commented TODOs referencing extractor + +**forms.py**: +- [ ] Rename function: `get_archive_methods()` → `get_plugin_choices()` +- [ ] Update form field: `archive_methods` → `plugins` + +--- + +### Phase 8: Views and Templates (archivebox/core/) + +**views.py**: +- [ ] Update dict building: `archiveresult_objects[result.plugin] = result` +- [ ] Update all extractor references to plugin + +**templatetags/core_tags.py**: +- [ ] **Rename template tags (BREAKING CHANGE)**: + - `extractor_icon()` → `plugin_icon()` + - `extractor_thumbnail()` → `plugin_thumbnail()` + - 
`extractor_embed()` → `plugin_embed()` +- [ ] Update internal: `result.extractor` → `result.plugin` + +**Update HTML templates** (if any directly reference extractor): +- [ ] Search for `{{ result.extractor }}` and similar +- [ ] Update to `{{ result.plugin }}` +- [ ] Update template tag calls +- [ ] **CRITICAL**: Update JavaScript in `templates/admin/progress_monitor.html`: + - Lines 491, 505: Change `extractor.extractor` and `a.extractor` to use `plugin` field + +--- + +### Phase 9: Worker System (archivebox/workers/worker.py) + +**ArchiveResultWorker**: +- [ ] Rename parameter: `extractor` → `plugin` (lines 348, 350) +- [ ] Update filter: `qs.filter(plugin=self.plugin)` +- [ ] Update subprocess passing: Use plugin parameter + +--- + +### Phase 10: State Machine (archivebox/core/statemachines.py) + +**ArchiveResultMachine**: +- [ ] Update logging: Use `self.archiveresult.plugin` instead of extractor +- [ ] Update any state metadata that includes extractor field + +--- + +### Phase 11: Tests and Fixtures + +**Update test files**: +- [ ] tests/test_migrations_*.py: Update expected field names in schema definitions +- [ ] tests/test_hooks.py: Update assertions for plugin/hook_name fields +- [ ] archivebox/tests/test_migrations_helpers.py: Update schema SQL (lines 161, 382, 468) +- [ ] tests/test_recursive_crawl.py: Update SQL query `WHERE extractor = '60_parse_html_urls'` (line 163) +- [ ] archivebox/cli/tests_piping.py: Update test function names and assertions +- [ ] Any fixtures that create ArchiveResults: Use plugin parameter +- [ ] Any mock objects that set `.extractor` attribute: Change to `.plugin` + +--- + +### Phase 12: Terminology Standardization (NEW) + +This phase standardizes terminology throughout the codebase to use consistent "plugin" nomenclature. 
+ +**via_extractor → plugin Rename (14 files)**: +- [ ] Rename metadata field `via_extractor` to just `plugin` +- [ ] Files affected: + - archivebox/hooks.py - Set plugin in run_hook() output + - archivebox/crawls/models.py - If via_extractor field exists + - archivebox/cli/archivebox_crawl.py - References to via_extractor + - All parser plugins that set via_extractor in output + - Test files with via_extractor assertions +- [ ] Update all JSONL output from parser plugins to use "plugin" key + +**Logging Functions (archivebox/misc/logging_util.py)**: +- [ ] `log_archive_method_started()` → `log_extractor_started()` (line 326) +- [ ] `log_archive_method_finished()` → `log_extractor_finished()` (line 330) + +**Form Functions (archivebox/core/forms.py)**: +- [ ] `get_archive_methods()` → `get_plugin_choices()` (line 15) +- [ ] Form field `archive_methods` → `plugins` (line 24, 29) +- [ ] Update form validation and view usage + +**Comments and Docstrings (81 files with "extractor" references)**: +- [ ] Update comments to say "extractor plugin" instead of just "extractor" +- [ ] Update comments to say "parser plugin" instead of "parser extractor" +- [ ] All plugin files: Update docstrings to use "extractor plugin" terminology + +**Package Manager Plugin Documentation**: +- [ ] Update comments in package manager hook files to say "package manager plugin": + - archivebox/plugins/apt/on_Binary__install_using_apt_provider.py + - archivebox/plugins/brew/on_Binary__install_using_brew_provider.py + - archivebox/plugins/npm/on_Binary__install_using_npm_provider.py + - archivebox/plugins/pip/on_Binary__install_using_pip_provider.py + - archivebox/plugins/env/on_Binary__install_using_env_provider.py + - archivebox/plugins/custom/on_Binary__install_using_custom_bash.py + +**String Literals in Error Messages**: +- [ ] Search for error messages containing "extractor" and update to "plugin" or "extractor plugin" +- [ ] Search for error messages containing "parser" and update to 
"parser plugin" where appropriate + +--- + +## Critical Files Summary + +### Must Update (Core): +1. ✅ `archivebox/core/models.py` - ArchiveResult, ArchiveResultManager, Snapshot +2. ✅ `archivebox/core/migrations/0033_*.py` - New migration +3. ⏳ `archivebox/hooks.py` - All hook execution and discovery functions +4. ⏳ `archivebox/misc/jsonl.py` - Serialization/deserialization + +### Must Update (CLI): +5. ⏳ `archivebox/cli/archivebox_extract.py` +6. ⏳ `archivebox/cli/archivebox_add.py` +7. ⏳ `archivebox/cli/archivebox_update.py` + +### Must Update (API): +8. ⏳ `archivebox/api/v1_core.py` +9. ⏳ `archivebox/api/v1_cli.py` + +### Must Update (Admin/Views): +10. ⏳ `archivebox/core/admin_archiveresults.py` +11. ⏳ `archivebox/core/views.py` +12. ⏳ `archivebox/core/templatetags/core_tags.py` + +### Must Update (Workers/State): +13. ⏳ `archivebox/workers/worker.py` +14. ⏳ `archivebox/core/statemachines.py` + +### Must Update (Tests): +15. ⏳ `tests/test_oneshot.py` +16. ⏳ `archivebox/tests/test_hooks.py` +17. ⏳ `archivebox/tests/test_migrations_helpers.py` - Schema SQL definitions +18. ⏳ `tests/test_recursive_crawl.py` - SQL queries with field names +19. ⏳ `archivebox/cli/tests_piping.py` - Test function docstrings + +### Must Update (Terminology - Phase 12): +20. ⏳ `archivebox/misc/logging_util.py` - Rename logging functions +21. ⏳ `archivebox/core/forms.py` - Rename form helper and field +22. ⏳ `archivebox/templates/admin/progress_monitor.html` - JavaScript field refs +23. ⏳ All 81 plugin files - Update docstrings and comments +24. 
âŗ 28 files with parser terminology - Update comments consistently + +--- + +## Migration Strategy + +### Data Migration for Existing Records: +```python +def forwards(apps, schema_editor): + ArchiveResult = apps.get_model('core', 'ArchiveResult') + # All existing records get empty hook_name + ArchiveResult.objects.all().update(hook_name='') +``` + +### Backwards Compatibility: +**BREAKING CHANGES** (per user requirements - no backwards compat): +- CLI flags: Hard cutover to `--plugins` (no aliases) +- API fields: `extractor` removed, `plugin` required +- Template tags: All renamed to `plugin_*` + +**PARTIAL COMPAT** (for migration): +- JSONL: Write 'plugin', but **accept both 'extractor' and 'plugin' on import** + +--- + +## Testing Checklist + +- [ ] Migration 0033 runs successfully on test database +- [ ] All migrations tests pass (test_migrations_*.py) +- [ ] All hook tests pass (test_hooks.py) +- [ ] CLI commands work with --plugins flag +- [ ] API endpoints return plugin/hook_name fields correctly +- [ ] Admin interface displays plugin correctly +- [ ] Admin progress monitor JavaScript works (no console errors) +- [ ] JSONL export includes both plugin and hook_name +- [ ] JSONL import accepts both 'extractor' and 'plugin' keys +- [ ] Hook execution populates hook_name field +- [ ] Worker filtering by plugin works +- [ ] Template tags render with new names (plugin_icon, etc.) +- [ ] All renamed functions work correctly +- [ ] SQL queries in tests use correct field names +- [ ] Terminology is consistent across codebase + +--- + +## Critical Issues to Address + +### 1. 
via_extractor Field (DECISION: RENAME) +- Currently used in 14 files for tracking which parser plugin discovered a URL +- **Decision**: Rename `via_extractor` → `plugin` (not via_plugin, just "plugin") +- **Impact**: Crawler and parser plugin code - 14 files to update +- Files affected: + - archivebox/hooks.py + - archivebox/crawls/models.py + - archivebox/cli/archivebox_crawl.py + - All parser plugins (parse_html_urls, parse_rss_urls, parse_jsonl_urls, etc.) + - Tests: tests_piping.py, test_parse_rss_urls_comprehensive.py +- This creates consistent naming where "plugin" is used for both: + - ArchiveResult.plugin (which extractor plugin ran) + - URL discovery metadata "plugin" (which parser plugin discovered this URL) + +### 2. Field Size Constraint +- Current: `extractor = CharField(max_length=32)` +- **Decision**: Keep max_length=32 when renaming to plugin +- No size increase needed + +### 3. Migration Implementation +- Use `migrations.RenameField('ArchiveResult', 'extractor', 'plugin')` for clean migration +- Preserves data, indexes, and constraints automatically +- Add hook_name field in same migration + +--- + +## Rollout Notes + +**Breaking Changes**: +1. CLI: `--extract`, `--extractors` → `--plugins` (no aliases) +2. API: `extractor` field → `plugin` field (no backwards compat) +3. Template tags: `extractor_*` → `plugin_*` (users must update custom templates) +4. Python API: All function names with "extractor" → "plugin" (import changes needed) +5. Form fields: `archive_methods` → `plugins` +6. **via_extractor → plugin** (URL discovery metadata field) + +**Migration Required**: Yes - all instances must run migrations before upgrading + +**Estimated Impact**: ~150+ files will need updates across the entire codebase +- 81 files: extractor terminology +- 28 files: parser terminology +- 10 files: archive_method legacy terminology +- Plus templates, JavaScript, tests, etc. + +--- + +## Next Steps + +1. 
**Continue with Phase 3**: Update hooks.py with all function renames and hook_name tracking +2. **Then Phase 4**: Update JSONL import/export with backwards compatibility +3. **Then Phases 5-12**: Systematically update all remaining files +4. **Finally Phase 13**: Run full test suite and verify everything works + +**Note**: Migration can be tested immediately - the migration file is ready to run! diff --git a/old/archivebox.ts b/old/archivebox.ts new file mode 100644 index 0000000000..44f1669dcc --- /dev/null +++ b/old/archivebox.ts @@ -0,0 +1,6107 @@ +tring'; +import { Readable } from 'node:stream'; +import { finished } from 'node:stream/promises'; +import { URL } from 'node:url'; +import util from 'node:util'; +const exec = util.promisify(child_process.exec); + +import { Readability } from '@mozilla/readability'; +import FileCookieStore from '@root/file-cookie-store'; +import merge from 'deepmerge'; +import { createCursor, getRandomPagePoint } from 'ghost-cursor'; +import { JSDOM, VirtualConsole } from 'jsdom'; +import mime from 'mime-types'; +import ToughCookie from 'tough-cookie'; +import unzip from 'unzip-crx-3'; + +import puppeteer from 'puppeteer'; +import { Browser, Page, Cookie, HTTPResponse } from 'puppeteer'; +import { Cluster } from 'puppeteer-cluster'; +import PupeteerExtra from "puppeteer-extra"; +import Stealth#!/usr/bin/env node --env-file .env +// https://gist.github.com/pirate/d9a350e83025a1e6cf452cddd815d0d4 + +// npm install request node-request minimist deepmerge mime-types decompress puppeteer-extra puppeteer-extra-plugin-repl puppeteer-extra-plugin-user-preferences puppeteer-extra-plugin-recaptcha puppeteer-extra-plugin-stealth puppeteer-screen-recorder puppeteer-cluster ghost-cursor @mozilla/readability jsdom unzip-crx-3 node-fetch@2 + + +import assert from 'node:assert/strict'; +import { Buffer } from 'node:buffer'; +import child_process from 'node:child_process'; +import crypto from 'node:crypto'; +import fs from 'node:fs'; +import { 
createServer } from 'node:http'; +import os from 'node:os'; +import path from 'node:path'; +import querystring from 'node:querysPlugin from "puppeteer-extra-plugin-stealth"; +import PrefsPlugin from 'puppeteer-extra-plugin-user-preferences'; +import { PuppeteerScreenRecorder } from 'puppeteer-screen-recorder'; +// import RecaptchaPlugin from 'puppeteer-extra-plugin-recaptcha'; +// import ReplPlugin from 'puppeteer-extra-plugin-repl'; + +const __dirname = import.meta.dirname + +import { getDatabase } from './models/init-models.js'; +const { Tag, Snapshot, ArchiveResult } = await getDatabase({ dbpath: './index.sqlite3' }) + + +// move mitm CA cert into /usr/local/share/ca-certificates/mitmproxy-ca-cert.crt +// update-ca-certificates + + +const ANSI = { + reset: "\x1b[0m", + blue: "\x1b[34m", + black: "\x1b[30m", +} + +/************************* Main Input Arguments *******************************/ +let URLS = [ + // 'chrome://about', + // 'chrome://system/#chrome_root_store', + + 'https://facebook.com/815781663692514/?comment_id=1508571679703640', + 'https://www.instagram.com/p/CrTY1fENHr5/', + 'https://www.tiktok.com/@zemmour_eric/video/7342474065598319904?cid=7343316616878490400', + 'https://twitter.com/DZasken68678/status/1799833933271687304', + 'https://t.me/IONONMIARRENDOGROUP/13598', + 'https://www.youtube.com/watch?v=rpD0qgzlCms', + 'https://www.aap.com.au/factcheck/aboriginal-lands-claim-a-total-abdication-of-facts/', + + + 'https://gologin.com/check-browser', + 'https://arh.antoinevastel.com/bots/areyouheadless', + + 'https://2captcha.com/demo/hcaptcha', + 'https://2captcha.com/demo/cloudflare-turnstile', + 'https://2captcha.com/demo/recaptcha-v3', + 'https://ipinfo.io/', + + // 'https://2captcha.com/demo/recaptcha-v2', + // 'https://2captcha.com/demo/keycaptcha', + // 'https://browserleaks.com/canvas', + // 'https://bot.incolumitas.com/#botChallenge', + // 'https://infosimples.github.io/detect-headless/', + // 'https://coveryourtracks.eff.org/', + // 
'https://fingerprint.com/demo/', + // 'https://nowsecure.nl', + // 'https://abrahamjuliot.github.io/creepjs/', + // 'https://scrapfly.io/web-scraping-tools/http2-fingerprint', + // 'https://scrapfly.io/web-scraping-tools/browser-fingerprint', + // 'https://scrapfly.io/web-scraping-tools/ja3-fingerprint', + // 'https://scrapfly.io/web-scraping-tools/canvas-fingerprint', + // 'https://scrapfly.io/web-scraping-tools/webgl-fingerprint', + // 'https://scrapfly.io/web-scraping-tools/audio-fingerprint', + // 'https://scrapfly.io/web-scraping-tools/screen-fingerprint', + // 'https://web-scraping.dev/', + + + // 'https://example.com', + // 'https://www.okta.com/', + // 'https://www.webflow.com/', + // 'https://docker-compose.archivebox.io', + // 'https://www.reddit.com/r/AskReddit/comments/1br0q9b/what_was_ok_10_years_ago_but_isnt_today/', + // 'https://www.quora.com/Is-the-website-2Captcha-true-or-fake-with-paying-money-for-working-on-it', + // 'https://x.com/yawnzzcalo7/status/1747853178849435894', + // 'https://twitter.com/yawnzzcalo7/status/1747853178849435894', + // 'https://rachdele.substack.com/p/is-the-job-market-dying', + // 'https://www.flowradar.com/cloneables/mouse-image-trail-effect', + // 'https://wrong.host.badssl.com/', + // 'http://docker-compose.archivebox.io', + // 'https://pptr.dev/api/puppeteer.page.setrequestinterception', + // 'https://blog.sweeting.me#Writing', + // 'https://github.com/yarnpkg/yarn/issues/9005', + + // 'https://archive.md/739Oc', + // 'https://archive.md/Oc72d', + // 'https://archive.vn/fPUBe', + // 'https://archive.vn/mRz4P', + // 'https://archive.vn/Qct6Y', + // 'https://archive.vn/sv50h', + // 'https://facebook.com/815781663692514/?comment_id=1508571679703640', + // 'https://facebook.com/815781663692514/?comment_id=924451748966499', + // 'https://www.facebook.com/wayne.brennan.528/posts/pfbid02fvxFppng2WsHMavhBa62cXizCBGdmPQRH3CMhac79qzS5C1ADaSNC587d3u6qVbkl', + // 
'https://www.facebook.com/wildeprods/posts/pfbid02YEPfoB7pZqMNzE4y2MpYSQbRAzASquvHyEMzHqrNngJCSL7onEg2jnsqS6epcQHWl', + // 'https://t.me/aubontouite_francais/9493', + // 'https://t.me/BC_BLACKMIROR/5044', + // 'https://t.me/IONONMIARRENDOGROUP/14004', + // 'https://t.me/newsfactory_pl/51014', + // 'https://t.me/oliverjanich/132574', + // 'https://t.me/tomaszgryguc/10449', + // 'https://t.me/amigosDisidentes/123177', + // 'https://twitter.com/1nfiltr4do_NN/status/1767238399943991389', + // 'https://twitter.com/4lmondcookie/status/1748519205438111914', + // 'https://twitter.com/4olll1ke/status/1753796944827199766', + // 'https://twitter.com/yeokiloss/status/1754908226179502345', + // 'https://twitter.com/YoungWaifLover/status/1735667278090297561', + // 'https://twitter.com/Z_Pour_Demain/status/1766133730278605182', + // 'https://www.aap.com.au/factcheck/aboriginal-lands-claim-a-total-abdication-of-facts/', + // 'https://www.aap.com.au/factcheck/absurd-albanese-clip-fools-voice-voters/', + // 'https://www.instagram.com/_the.forgotten.ones/p/CQQDyoqhsF6/', + // 'https://www.instagram.com/p/CqSM_f9MR4b/', + // 'https://www.instagram.com/p/CqSQgf1sv8B/', + // 'https://instagram.com/p/B-Q22Z_pxyC/', + // 'https://www.tiktok.com/@zitatezurzeit/photo/7342474065598319904?cid=7343316616878490400', + // 'https://tiktok.com/@zitatezurzeit/photo/7342474065598319904?cid=7343316616878490400', + // 'https://www.youtube.com/watch?v=rpD0qgzlCms', +] + +const isTruthy = (env_value) => ['1', 'yes', 'true'].includes(env_value?.toLowerCase() || 'false') + +/********************** Config: General High-Level Options ********************/ + +const PASSIVE_ARCHIVING = isTruthy(process.env.PASSIVE_ARCHIVING) +const CHROME_CLUSTER = isTruthy(process.env.CHROME_CLUSTER) +const CHROME_CLUSTER_WORKERS = 4 + +const API_SERVER_HOST = '0.0.0.0' +const API_SERVER_PORT = 9595 +const CHROME_DEBUG_PORT = 9222 // 9222 is default, or use 0 for random port + +/********************** Config: Keys & Secrets 
********************************/ + +const API_KEY_2CAPTCHA = process.env.API_KEY_2CAPTCHA || 'YOUR_API_KEY_HERE' +const FLARESOLVERR_API_ENDPOINT = process.env.FLARESOLVERR_API_ENDPOINT || "http://localhost:8191/v1" + +const ACTIVE_PERSONA = process.env.ACTIVE_PERSONA || 'Default' +const CHROME_PROFILE_USER = process.env.CHROME_PROFILE_USER || 'Default' +const LOAD_AUTH_STORAGE = isTruthy(process.env.LOAD_AUTH_STORAGE) +const SAVE_AUTH_STORAGE = isTruthy(process.env.SAVE_AUTH_STORAGE) + +/********************** Config: Data Dir Locations ****************************/ + +const SRC_DIR = path.resolve(__dirname) +const DATA_DIR = process.env.DATA_DIR || await fs.promises.realpath(path.join(SRC_DIR, 'data')) +const INDEXES_DIR = path.join(DATA_DIR, 'index') +const ARCHIVE_DIR = path.join(DATA_DIR, 'archive') +if (!fs.existsSync(ARCHIVE_DIR)) + throw 'Could not find data/archive, are you running in the right pwd?' + +const PERSONA_DIR = path.join(DATA_DIR, 'personas', ACTIVE_PERSONA) +const CHROME_PROFILE_PATH = path.join(PERSONA_DIR, 'chrome_profile') +const CHROME_DOWNLOADS_DIR = path.join(PERSONA_DIR, 'chrome_downloads') +const CHROME_EXTENSIONS_DIR = path.join(PERSONA_DIR, 'chrome_extensions') +const CHROME_EXTENSIONS_JSON_PATH = path.join(CHROME_EXTENSIONS_DIR, 'extensions.json') +const AUTH_JSON_PATH = path.join(PERSONA_DIR, 'auth.json') +const COOKIES_TXT_PATH = path.join(PERSONA_DIR, 'cookies.txt') +const SPEEDTESTS_DIR = path.join(PERSONA_DIR, 'speedtests') +// const CHROME_PROFILE_IMPORT_USER = 'Profile 1' +// const CHROME_PROFILE_IMPORT_PATH = '/Volumes/NVME/Users/squash/Library/Application Support/Google/Chrome' + +// chrome profile / persona directories +fs.mkdirSync(PERSONA_DIR, {recursive: true}) +fs.mkdirSync(SPEEDTESTS_DIR, {recursive: true}) +fs.mkdirSync(CHROME_PROFILE_PATH, {recursive: true}) +fs.mkdirSync(CHROME_EXTENSIONS_DIR, {recursive: true}) +fs.mkdirSync(CHROME_DOWNLOADS_DIR, {recursive: true}) + +// cruft directories +const ORPHANS_DIR = 
path.join(DATA_DIR, 'orphans') +const PARTIALS_DIR = path.join(DATA_DIR, 'partials') +const DUPLICATES_DIR = path.join(DATA_DIR, 'duplicates') +await fs.promises.mkdir(ORPHANS_DIR, {recursive: true}) +await fs.promises.mkdir(PARTIALS_DIR, {recursive: true}) +await fs.promises.mkdir(DUPLICATES_DIR, {recursive: true}) + +/********************** Config: Viewport Setup Opts ***************************/ + +// Config: Viewport +const DEFAULT_TIMEOUT = 20_000 +const DEFAULT_GEOLOCATION = {latitude: 59.95, longitude: 30.31667} +const DEFAULT_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36' +const DEFAULT_ASPECT_RAIO = 16/9 // recommended: 16:9 (most common desktop window aspect ratio) +const SCREENSHOT_ASPECT_RATIO = 4/3 // recommended: 4:3 (easier to use as thumbnails when square-ish) +const DEFAULT_WINDOW_WIDTH = 1920 // recommended: 1920x1080p (1080p screenshots) +const DEFAULT_WINDOW_HEIGHT = Math.floor(DEFAULT_WINDOW_WIDTH/DEFAULT_ASPECT_RAIO) +const DEFAULT_VIEWPORT = { + width: DEFAULT_WINDOW_WIDTH, + height: DEFAULT_WINDOW_HEIGHT, + deviceScaleFactor: 2, // 2 gives much sharper text in screenshots/pdfs/etc but uses more CPU/GPU + isMobile: false, + hasTouch: false, + isLandscape: false, +} +const DEFAULT_COLOR_SCHEME = 'light' +const DEFAULT_HEADERS = { + // requires frequent tweaking to remain undetected by cloudflare/recaptcha/etc. + // 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', + // 'accept-encoding': 'gzip, deflate, br, zstd', + // 'accept-language': accept_language, + // 'cache-Control': no_cache ? 'no-cache' : '', + // 'dnt': '1', + 'sec-ch-ua': '"Google Chrome";v="122", "Not:A-Brand";v="8", "Chromium";v="122"', + 'sec-ch-ua-mobile': '?0', + 'sec-ch-ua-platform': '"macOS"', + 'connection-rtt': '50', + // 'pragma': no_cache ? 
'no-cache' : '', + // 'sec-fetch-dest': 'document', + // 'sec-fetch-mode': 'navigate', + // 'sec-fetch-site': 'none', + // 'sec-fetch-user': '?1', + // // 'upgrade-insecure-requests': '1', // breaks some sites, e.g. https://www.flowradar.com/cloneables/mouse-image-trail-effect + // 'user-agent': user_agent, +} + +const DEFAULT_REFERRERS = ["https://www.google.com", "https://www.facebook.com", "https://www.instagram.com"] + +/****************** Config: Human Behavior Emulation **************************/ + +const SCROLL_LIMIT = 20; // e.g. 30 = 30 * (1000px/2s) => 30,000px scrolled in 60sec +const SCROLL_DELAY = 1350; // interval per scroll, e.g. 2000 = 2sec to travel 1 * SCROLL_DISTANCE +const SCROLL_DISTANCE = DEFAULT_VIEWPORT.height - 100; // make sure this is slightly less than viewport height so there is some overlap to make stitching easier + +/********************** Config: URL Rewriting *********************************/ +const URL_REWRITES = [ + // replacements should come first + // { + // idx: 0, + // pattern: /\/\/(www\.)?x\.com/gi, + // replacement: '//$1twitter.com/', + // // TODO: scope: 'hostname', + // }, + // { + // idx: 1, + // pattern: /\/\/(www\.)?twitter\.com/gi, + // replacement: '//$1nitter.net', + // // TODO: scope: 'hostname', + // }, + + // // blocks should come at the end + // { + // idx: 999, + // pattern: /\/\/(www\.)?notallowed\.com/gi, + // replacement: '', + // // TODO: scope: 'href', + // }, +] +const URL_SCHEMES_IGNORED = [ + '', // no scheme is also invalid (e.g. 
opening a new tab page without any url yet) + 'chrome', + 'chrome-extension', + 'chrome-untrusted', + 'file', + 'data', + 'about', +] + + +/**************** Load existing data/archive/ snapshots *************/ + +const snapshots = await Snapshot.findAll({ attributes: ['id', 'timestamp', 'url'] }) // include: { model: ArchiveResult, as: 'archiveresults' }, }); +const results = await ArchiveResult.findAll({ attributes: ['id', 'snapshot_id', 'extractor', 'start_ts'] }) // include: { model: Snapshot, as: 'snapshot' }, }); +globalThis.snapshots = snapshots +globalThis.results = results +console.log(`[đŸ’ŋ] Found ${snapshots.length} existing snapshots in index.sqlite3...`) +console.log(`[đŸ’ŋ] Found ${results.length} existing results in index.sqlite3...`) +// debugger; + +const locateExistingSnapshots = (archive_dir) => { + const urls_to_dirs = {} + // for each data/archive//index.json found, store {url: data/archive/} + for (const snapshot_dir of fs.readdirSync(archive_dir)) { + const snapshot_json = path.join(archive_dir, snapshot_dir, 'index.json') + if (fs.existsSync(snapshot_json)) { + const {url, archive_path} = JSON.parse(fs.readFileSync(snapshot_json, 'utf-8')) + if (!snapshot_dir.includes(archive_path.replace('archive/', ''))) + throw 'Found incorrect index.json inside snapshot dir' + snapshot_dir + if (url && url.includes('://')) { + urls_to_dirs[url] = path.join(archive_dir, snapshot_dir) + } + } + } + return urls_to_dirs +} + +let SNAPSHOT_DIRS_BY_URL = locateExistingSnapshots(ARCHIVE_DIR) + +let all_snap_dirs = (await fs.promises.readdir(ARCHIVE_DIR)) +// const orphan_snap_dirs = all_snap_dirs.filter(dirname => dirname.startsWith('19999')) + +// // scan through existing snapshot dirs, move orphans to orphans/ or correct archive/ +// for (const snap_id of orphan_snap_dirs) { +// if (snap_id.startsWith('.')) continue +// const src_dir = path.join(ARCHIVE_DIR, snap_id) +// let src_path = src_dir + +// assert((await fs.promises.stat(src_dir)).isDirectory()) +// 
let dest_path = null + +// const orphan_metrics_path = path.join(src_dir, 'metrics.json') +// if (fs.existsSync(orphan_metrics_path)) { +// const orphan_metrics = JSON.parse(await fs.promises.readFile(orphan_metrics_path, 'utf-8')) +// const url = orphan_metrics.url || orphan_metrics.URL +// const version = orphan_metrics.VERSION || versionStrFromDate(orphan_metrics.start_time) + +// // move all bare files into ./versions/YYYYMMDD/* and symlink ./* to latest version +// await symlinkBestSnapshotResults(src_dir) + +// dest_path = SNAPSHOT_DIRS_BY_URL[url] +// const dest_id = dest_path?.split('/').at(-1) + +// if (dest_id && (dest_id != snap_id)) { +// if (fs.existsSync(dest_path)) { +// console.log(` - moving duplicate snap_dir ${src_dir} -> ${dest_path}`) +// } else { +// console.log(` - moving valid snap_dir ${src_dir} -> ${dest_path}`) +// } +// } else if (dest_id == snap_id) { +// continue +// } else { +// dest_path = path.join(ORPHANS_DIR, snap_id) +// console.log(` - moving orphan snap_dir ${src_dir} -> ${dest_path}`) +// } +// } else { +// // corrupt/par +// dest_path = path.join(PARTIALS_DIR, snap_id) +// console.log(` - moving parial snap_dir ${src_dir} -> ${dest_path}`) +// } +// if (dest_path) { +// for (const version_dir of (await fs.promises.readdir(path.join(src_path, 'versions')))) { +// const version_src = path.join(src_path, 'versions', version_dir) +// const version_dst = path.join(dest_path, 'versions', version_dir) + +// // move all bare files into ./versions/YYYYMMDD/* and symlink ./* to latest version +// await symlinkBestSnapshotResults(dest_path) + +// assert(!fs.existsSync(version_dst)) +// await fs.promises.rename(version_src, version_dst) +// console.log(' - ', version_src, '--->', version_dst) +// } +// await fs.promises.rename(src_dir, path.join(PARTIALS_DIR, snap_id)) +// await symlinkBestSnapshotResults(dest_path) +// } +// } + +// const duplicate_snap_dirs = (await fs.promises.readdir(DUPLICATES_DIR)).filter(dirname => 
dirname.startsWith('19999'))
+// for (const snap_id of duplicate_snap_dirs) {
+//     const src_dir = path.join(DUPLICATES_DIR, snap_id)
+//     const metrics = JSON.parse(await fs.promises.readFile(path.join(src_dir, 'metrics.json'), 'utf-8'))
+// }
+
+// all_snap_dirs = (await fs.promises.readdir(ARCHIVE_DIR))
+// for (const snap_id of all_snap_dirs) {
+//     if (snap_id.startsWith('.')) continue
+//     const snap_dir = path.join(ARCHIVE_DIR, snap_id)
+//     const metrics_path = path.join(snap_dir, 'metrics.json')
+//     if (fs.existsSync(metrics_path)) {
+//         // console.log(' - updating snap_dir', snap_dir)
+//         await symlinkBestSnapshotResults(snap_dir)
+//     }
+// }
+// SNAPSHOT_DIRS_BY_URL = locateExistingSnapshots(ARCHIVE_DIR)
+
+
+// start from an empty queue.csv, rows are appended one by one below
+fs.writeFileSync(path.join(DATA_DIR, 'queue.csv'), '')
+
+// everything after the last '/archive/' in a snapshot dir path (the snapshot dir name)
+const snapIdFromDir = (dir_path) => {
+    const pieces = dir_path.split('/archive/')
+    return pieces[pieces.length - 1]
+}
+
+// [url, snapshot_dir] pairs: sorted by numeric snapshot id descending, then flipped
+const snapshot_dir_list = Object.entries(SNAPSHOT_DIRS_BY_URL)
+snapshot_dir_list.sort((entry_a, entry_b) =>
+    Number(snapIdFromDir(entry_b[1])) - Number(snapIdFromDir(entry_a[1])))
+snapshot_dir_list.reverse()
+
+for (const [existing_url, snapshot_dir] of snapshot_dir_list) {
+    // if (existing_url.startsWith('https://www.facebook.com/')) {
+    const is_excluded_url = existing_url.includes('facebook.com/') || existing_url.includes('instagram.com/')
+    const already_archived = false // fs.existsSync(path.join(SNAPSHOT_DIRS_BY_URL[existing_url], 'versions'))
+    if (is_excluded_url || already_archived) continue
+
+    // URLS.push(existing_url)
+    // snapshot_dir is the value stored under existing_url in SNAPSHOT_DIRS_BY_URL
+    const queue_row = `${snapshot_dir},${existing_url}\n`
+    fs.appendFileSync(path.join(DATA_DIR, 'queue.csv'), queue_row, 'utf-8')
+}
+URLS = [...new Set(URLS)]
+console.log('[+] Added', URLS.length, 'existing urls to queue...')
+
+
+/********************** Config: Output Paths **********************************/
+// const TASK_PATH = (url) => path.join(DATA_DIR, 'results', `${hashCode(url)}`)
+// existing snapshot dir for the url if there is one, otherwise a placeholder dir named 1999999999.<hashCode(url)>
+const TASK_PATH = (url) => {
+    const known_snapshot_dir = SNAPSHOT_DIRS_BY_URL[url]
+    return known_snapshot_dir || path.join(ARCHIVE_DIR, `1999999999.${hashCode(url)}`)
+}
+// const TASK_PATH = (url) => {
+//     const existing_snap_dir = SNAPSHOT_DIRS_BY_URL[url]
+//     assert(existing_snap_dir, `Could not find existing snapshot dir for ${url}`)
+//     return existing_snap_dir
+// }
+
+// Path of one output file for a page: TASK_PATH(page._original_url) joined with filename+extname.
+// extname is appended verbatim (no '.' is inserted), so pass it with its leading dot.
+const OUTPUT_PATH = (page, filename, extname='') =>
+    path.join(TASK_PATH(page._original_url), `${filename}${extname}`)
+
+// One helper per output artifact; each takes the page and returns the artifact's path inside
+// that page's task dir. (ACCESIBILITY_PATH is misspelled, the name is kept as-is because it
+// may be referenced elsewhere in the file.)
+const SSL_PATH = (page) => OUTPUT_PATH(page, 'ssl.json')
+const CONSOLELOG_PATH = (page) => OUTPUT_PATH(page, 'console.log')
+const HEADERS_PATH = (page) => OUTPUT_PATH(page, 'headers.json')
+const REDIRECTS_PATH = (page) => OUTPUT_PATH(page, 'redirects.json')
+const REQUESTS_PATH = (page) => OUTPUT_PATH(page, 'requests.json')
+const TRACE_PATH = (page) => OUTPUT_PATH(page, 'trace.json')
+const METRICS_PATH = (page) => OUTPUT_PATH(page, 'metrics.json')
+const OUTLINKS_PATH = (page) => OUTPUT_PATH(page, 'outlinks.json')
+const SEO_PATH = (page) => OUTPUT_PATH(page, 'seo.json')
+const FAVICON_PATH = (page) => OUTPUT_PATH(page, 'favicon.json')
+const TITLE_PATH = (page) => OUTPUT_PATH(page, 'title.txt')
+const BODYTEXT_PATH = (page) => OUTPUT_PATH(page, 'body.txt')
+const PANDOC_PATH = (page) => OUTPUT_PATH(page, 'pandoc.md')
+const READABILITY_PATH = (page) => OUTPUT_PATH(page, 'readability.json')
+const ACCESIBILITY_PATH = (page) => OUTPUT_PATH(page, 'accessibility.json')
+const DOM_PATH = (page) => OUTPUT_PATH(page, 'dom.html')
+const PDF_PATH = (page) => OUTPUT_PATH(page, 'output.pdf')
+const SCREENSHOT_PATH = (page) => OUTPUT_PATH(page, 'screenshot.png')
+const SCREENSHOT_JPG_PATH = (page) => OUTPUT_PATH(page, 'screenshot.jpg')
+const AIQA_PATH = (page) => OUTPUT_PATH(page, 'aiqa.json')
+const SINGLEFILE_PATH = (page) => OUTPUT_PATH(page, 'singlefile.html')
+// the next two are directories rather than single files (note the trailing slash)
+const YTDLP_PATH = (page) => OUTPUT_PATH(page, 'media/')
+const GALLERYDL_PATH = (page) => OUTPUT_PATH(page, 'photos/')
+const SCREENRECORDING_PATH = (page) => OUTPUT_PATH(page, 'screenrecording.mp4')
+const SCREENRECORDGIF_PATH = (page) => OUTPUT_PATH(page, 'screenrecording.gif')
+const RESPONSES_PATH = (page) => 
OUTPUT_PATH(page, 'responses')
+const RAW_PATH = (page) => OUTPUT_PATH(page, 'raw')
+
+
+
+/********************** Config: Chrome Extensions *****************************/
+
+// Minimal static description of an extension, as written by hand in CHROME_EXTENSIONS below
+interface ChromeExtension {
+    name: string
+    webstore_id: string
+}
+// Same extension after loadOrInstallExtension() has filled in the derived metadata.
+// Every derived field is optional because the entries start out with only {webstore_id, name}.
+interface LoadedChromeExtension extends ChromeExtension {
+    id?: string                         // id computed from unpacked_path by getExtensionId()
+    version?: string | null             // manifest version, assigned from read_version() by loadOrInstallExtension()
+    webstore_url?: string
+    crx_url?: string
+    crx_path?: string
+    unpacked_path?: string
+    read_manifest?: () => any
+    read_version?: () => string | null
+}
+
+const CHROME_EXTENSIONS: LoadedChromeExtension[] = [
+    // Content access / unblocking / blocking plugins
+    {webstore_id: 'ifibfemgeogfhoebkmokieepdoobkbpo', name: 'twocaptcha'}, // https://2captcha.com/blog/how-to-use-2captcha-solver-extension-in-puppeteer
+    {webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm', name: 'istilldontcareaboutcookies'},
+    {webstore_id: 'cjpalhdlnbpafiamejdnhcphjbkeiagm', name: 'ublock'},
+    // {webstore_id: 'mlomiejdfkolichcflejclcbmpeaniij', name: 'ghostery'},
+    // {webstore_id: 'mnjggcdmjocbbbhaepdhchncahnbgone', name: 'sponsorblock'},
+    // {webstore_id: 'iplffkdpngmdjhlpjmppncnlhomiipha', name: 'unpaywall'},
+    // {webstore_id: 'gofocbepaccnkpphbgjpolififgcakhn', name: 'spaywallnews'},
+
+    // Archiving plugins
+    {webstore_id: 'mpiodijhokgodhhofbcjdecpffjipkle', name: 'singlefile'},
+    // {webstore_id: 'fpeoodllldobpkbkabpblcfaogecpndd', name: 'archivewebpage'},
+    // {webstore_id: 'niloccemoadcdkdjlinkgdfekeahmflj', name: 'pocket'},
+    // {webstore_id: 'kenncghfghgolcbmckhiljgaabnpcaaa', name: 'warcreate'},
+    // {webstore_id: 'jjndjgheafjngoipoacpjgeicjeomjli', name: 'puppeteerstream'},
+
+    // Utilities for humans setting up/viewing/debugging the archiving session
+    // {webstore_id: 'aeblfdkhhhdcdjpifhhbdiojplfjncoa', name: '1password'},
+    // {webstore_id: 'fngmhnnpilhplaeedifhccceomclgfbg', name: 'editthiscookie'},
+    // {webstore_id: 'cgfpgnepljlgenjclbekbjdlgcodfmjp', name: 'simpletabsorter'},
+
+    // Scripting/automation plugins
+    // {webstore_id: 
'jinjaccalgkegednnccohejagnlnfdag', name: 'violentmonkey'}, + // {webstore_id: 'infppggnoaenmfagbfknfkancpbljcca', name: 'automa'}, + // {webstore_id: 'pfegffhjcgkneoemnlniggnhkfioidjg', name: 'screenscraper'}, +] + +/******************** Config: Chrome Profile Preferences **********************/ + +// https://niek.github.io/chrome-features/ +const CHROME_DISABLED_COMPONENTS = [ + 'Translate', + 'AcceptCHFrame', + 'OptimizationHints', + 'ProcessPerSiteUpToMainFrameThreshold', + 'InterestFeedContentSuggestions', + 'CalculateNativeWinOcclusion', + 'BackForwardCache', + 'HeavyAdPrivacyMitigations', + 'LazyFrameLoading', + 'ImprovedCookieControls', + 'PrivacySandboxSettings4', + 'AutofillServerCommunication', + 'CertificateTransparencyComponentUpdater', + 'DestroyProfileOnBrowserClose', + 'CrashReporting', + 'OverscrollHistoryNavigation', + 'InfiniteSessionRestore', + //'LockProfileCookieDatabase', // disabling allows multiple chrome instances to concurrently modify profile, but might make chrome much slower https://github.com/yt-dlp/yt-dlp/issues/7271 https://issues.chromium.org/issues/40901624 +] + +const CHROME_PREFERENCES_EXTRA = {} +const CHROME_PREFERENCES_DEFAULT = { + // https://chromium.googlesource.com/chromium/src/+/32352ad08ee673a4d43e8593ce988b224f6482d3/chrome/common/pref_names.cc + homepage: 'about:blank', // doesn't work here, managed by Secure Preferences + homepage_is_newtabpage: false, // doesn't work here, managed by Secure Preferences + session: { // doesn't work here, managed by Secure Preferences + restore_on_startup: 4, // doesn't work here, managed by Secure Preferences + startup_urls: 'about:blank', // doesn't work here, managed by Secure Preferences + }, + default_apps: 'noinstall', + browser: { + confirm_to_quit: false, + enable_spellchecking: false, + check_default_browser: false, + show_update_promotion_info_bar: false, + }, + profile: { + // name: 'ArchiveBox Persona: Default', // doesnt work to change display name, not sure why + // 
using_default_name: false,
+        exited_cleanly: true,
+        default_content_setting_values: {
+            automatic_downloads: 1,
+        },
+    },
+    bookmark_bar: {show_on_all_tabs: false},
+    safebrowsing: {enabled: false},
+    search: {suggest_enabled: false},
+    download: {
+        prompt_for_download: false,
+        open_pdf_in_system_reader: true,
+        // default_directory: CHROME_DOWNLOADS_DIR || path.join(__dirname, 'downloads'),
+    },
+    select_file_dialogs: {allowed: false},
+    autofill: {save_data: false},
+    printing: {enabled: false},
+    message_center: {welcome_notification_dismissed_local: true},
+    extensions: {
+        ui: {
+            developer_mode: true,
+            dismissed_adt_promo: true,
+        },
+        // pinned_extensions: CHROME_EXTENSIONS?.map(({id}) => id) || [],
+    },
+    webkit: {
+        webprefs: {
+            javascript_enabled: true,
+            minimum_font_size: 9,
+            // default_font_size: 12,
+            // web_security_enabled: false,
+            // allow_displaying_insecure_content: true,
+            // allow_running_insecure_content: true,
+            java_enabled: true,
+            loads_images_automatically: true,
+        },
+    },
+    settings: {
+        multi_profile_never_show_intro: true,
+        multi_profile_warning_show_dismissed: true,
+        first_run_tutorial_shown: true,
+    },
+    plugins: {
+        always_open_pdf_externally: true,
+    },
+}
+
+// NOTE(review): hard-codes the 'Default' profile dir while the launch args use CHROME_PROFILE_USER
+// for --profile-directory; confirm the two always agree.
+const CHROME_PREFERENCES_PATH = path.join(CHROME_PROFILE_PATH, 'Default', 'Preferences')
+
+/**
+ * Build the full preferences object: defaults, then caller extras, then the values that can only
+ * be computed at runtime (pinned extension ids and the downloads dir). Later sources win.
+ *
+ * Extensions whose id is not known yet are left out of pinned_extensions; otherwise they would be
+ * serialized as null entries once the preferences are written out as JSON.
+ */
+const getChromePreferences = ({CHROME_PREFERENCES_DEFAULT, CHROME_PREFERENCES_EXTRA, CHROME_EXTENSIONS, CHROME_DOWNLOADS_DIR}) =>
+    merge.all([CHROME_PREFERENCES_DEFAULT, CHROME_PREFERENCES_EXTRA, {
+        extensions: {
+            pinned_extensions: CHROME_EXTENSIONS?.map(({id}) => id).filter(Boolean) || [],
+        },
+        download: {
+            default_directory: CHROME_DOWNLOADS_DIR || path.join(__dirname, 'downloads'),
+        },
+    }])
+
+/**
+ * Apply preferences to a Chrome profile.
+ *  - profile already exists on disk: merge into the existing Preferences JSON and rewrite the file
+ *  - profile not created yet: register PrefsPlugin on puppeteer instead (it only works on first creation)
+ *
+ * @param puppeteer    puppeteer-extra style object exposing .use()
+ * @param prefs_path   path to the profile's Preferences JSON file
+ * @param preferences  preferences object to apply (see getChromePreferences)
+ * @returns            the same puppeteer object, for chaining
+ */
+function applyChromePreferences(puppeteer, prefs_path, preferences) {
+    if (fs.existsSync(prefs_path)) {
+        const preferences_existing = JSON.parse(fs.readFileSync(prefs_path, 'utf-8'))
+        // merge.all() above is the deepmerge API, whose default is to CONCATENATE arrays; without
+        // arrayMerge, list prefs such as pinned_extensions would grow by one copy on every launch
+        const preferences_merged = merge(preferences_existing, preferences, {
+            arrayMerge: (_existing, incoming) => incoming,
+        })
+        // 
console.log(JSON.stringify(preferences_merged, null, 4)) + fs.writeFileSync(prefs_path, JSON.stringify(preferences_merged)) + } else { + // otherwise profile has not been created yet, use plugin instead (plugin only works on first creation) + puppeteer.use(PrefsPlugin({userPrefs: preferences})) + } + return puppeteer +} + + +/******************** Config: Chrome Launch Args ******************************/ + +const CHROME_ARGS_DEFAULT = [ + // Headless behavior tuning, determinstic behavior settings + // '--headless=new', + '--test-type', + '--test-type=gpu', // https://github.com/puppeteer/puppeteer/issues/10516 + '--deterministic-mode', + '--js-flags=--random-seed=1157259159', // make all JS random numbers deterministic by providing a seed + '--allow-pre-commit-input', // allow JS mutations before page rendering is complete + '--disable-blink-features=AutomationControlled', // hide the signatures that announce browser is being remote-controlled + '--enable-automation', // <- DONT USE THIS, it makes you easily detectable / blocked by cloudflare + // `--proxy-server=https://43.159.28.126:2334:u7ce652b7568805c4-zone-custom-region-us-session-szGWq3FRU-sessTime-60:u7ce652b7568805c4`, // send all network traffic through a proxy https://2captcha.com/proxy + // `--proxy-bypass-list=127.0.0.1`, + + // Docker-specific options + // https://github.com/GoogleChrome/lighthouse-ci/tree/main/docs/recipes/docker-client#--no-sandbox-issues-explained + // '--no-sandbox', // rely on docker sandboxing in docker, otherwise we need cap_add: SYS_ADM to use host sandboxing + // '--disable-gpu-sandbox', + // '--disable-setuid-sandbox', + // '--disable-dev-shm-usage', // docker 75mb default shm size is not big enough, disabling just uses /tmp instead + // '--no-xshm', + + // Profile data dir setup + // chrome://profile-internals + `--user-data-dir=${CHROME_PROFILE_PATH}`, + `--profile-directory=${CHROME_PROFILE_USER}`, + '--password-store=basic', // use mock keychain instead of OS-provided 
keychain (we manage auth.json instead) + '--use-mock-keychain', + '--disable-cookie-encryption', // we need to be able to write unencrypted cookies to save/load auth.json + // '--disable-sync', // don't try to use Google account sync features + + // Extensions + // chrome://inspect/#extensions + // `--load-extension=${CHROME_EXTENSIONS.map(({unpacked_path}) => unpacked_path).join(',')}`, // not needed when using existing profile that already has extensions installed + `--allowlisted-extension-id=${CHROME_EXTENSIONS.map(({ webstore_id }) => webstore_id).join(',')}`, + '--allow-legacy-extension-manifests', + + // Browser window and viewport setup + // chrome://version + // `--user-agent="${DEFAULT_USER_AGENT}"`, + // `--window-size=${DEFAULT_VIEWPORT.width},${DEFAULT_VIEWPORT.height}`, + '--window-position=0,0', + '--hide-scrollbars', // hide scrollbars because otherwise they show up in screenshots + '--install-autogenerated-theme=169,32,85', // red border makes it easier to see which chrome window is archivebox's + '--autoplay-policy=no-user-gesture-required', // auto-start videos so they trigger network requests + show up in outputs + '--disable-gesture-requirement-for-media-playback', + '--lang=en-US,en;q=0.9', + + // DANGER: JS isolation security features (to allow easier tampering with pages during archiving) + // chrome://net-internals + // '--disable-web-security', // <- WARNING, breaks some sites that expect/enforce strict CORS headers (try webflow.com) + // '--disable-features=IsolateOrigins,site-per-process', // useful for injecting JS, but some very strict sites can panic / show error pages when isolation is disabled (e.g. 
webflow.com) + // '--allow-running-insecure-content', // Breaks CORS/CSRF/HSTS etc., useful sometimes but very easy to detect + // '--allow-file-access-from-files', // <- WARNING, dangerous, allows JS to read filesystem using file:// URLs + + // // DANGER: Disable HTTPS verification + // '--ignore-certificate-errors', + // '--ignore-ssl-errors', + // '--ignore-certificate-errors-spki-list', + // '--allow-insecure-localhost', + + // IO: stdin/stdout, debug port config + // chrome://inspect + '--log-level=2', // 1=DEBUG 2=WARNING 3=ERROR + '--enable-logging=stderr', + '--remote-debugging-address=0.0.0.0', + `--remote-debugging-port=${CHROME_DEBUG_PORT}`, + + // GPU, canvas, text, and pdf rendering config + // chrome://gpu + '--enable-webgl', // enable web-gl graphics support + '--font-render-hinting=none', // make rendering more deterministic by ignoring OS font hints, may also need css override, try: * {text-rendering: geometricprecision !important; -webkit-font-smoothing: antialiased;} + '--force-color-profile=srgb', // make rendering more deterministic by using consitent color profile, if browser looks weird, try: generic-rgb + '--disable-partial-raster', // make rendering more deterministic (TODO: verify if still needed) + '--disable-skia-runtime-opts', // make rendering more deterministic by avoiding Skia hot path runtime optimizations + '--disable-2d-canvas-clip-aa', // make rendering more deterministic by disabling antialiasing on 2d canvas clips + // '--disable-gpu', // falls back to more consistent software renderer + // // '--use-gl=swiftshader', <- DO NOT USE, breaks M1 ARM64. 
it makes rendering more deterministic by using simpler CPU renderer instead of OS GPU renderer bug: https://groups.google.com/a/chromium.org/g/chromium-dev/c/8eR2GctzGuw + // // '--disable-software-rasterizer', <- DO NOT USE, harmless, used in tandem with --disable-gpu + // // '--run-all-compositor-stages-before-draw', <- DO NOT USE, makes headful chrome hang on startup (tested v121 Google Chrome.app on macOS) + // // '--disable-gl-drawing-for-tests', <- DO NOT USE, disables gl output (makes tests run faster if you dont care about canvas) + // // '--blink-settings=imagesEnabled=false', <- DO NOT USE, disables images entirely (only sometimes useful to speed up loading) + + // Process management & performance tuning + // chrome://process-internals + '--disable-lazy-loading', // make rendering more deterministic by loading all content up-front instead of on-focus + '--disable-renderer-backgrounding', // dont throttle tab rendering based on focus/visibility + '--disable-background-networking', // dont throttle tab networking based on focus/visibility + '--disable-background-timer-throttling', // dont throttle tab timers based on focus/visibility + '--disable-backgrounding-occluded-windows', // dont throttle tab window based on focus/visibility + '--disable-ipc-flooding-protection', // dont throttle ipc traffic or accessing big request/response/buffer/etc. 
objects will fail + '--disable-extensions-http-throttling', // dont throttle http traffic based on runtime heuristics + '--disable-field-trial-config', // disable shared field trial state between browser processes + '--disable-back-forward-cache', // disable browsing navigation cache + // '--in-process-gpu', <- DONT USE THIS, makes headful startup time ~5-10s slower (tested v121 Google Chrome.app on macOS) + // '--disable-component-extensions-with-background-pages', // TODO: check this, disables chrome components that only run in background (could lower startup time) + + // uncomment to disable hardware camera/mic/speaker access + present fake devices to websites + // (faster to disable, but disabling breaks recording browser audio in puppeteer-stream screenrecordings) + // '--use-fake-device-for-media-stream', + // '--use-fake-ui-for-media-stream', + // '--disable-features=GlobalMediaControls,MediaRouter,DialMediaRouteProvider', + + // // Output format options (PDF, screenshot, etc.) + '--export-tagged-pdf', // include table on contents and tags in printed PDFs + '--generate-pdf-document-outline', + + // Suppress first-run features, popups, hints, updates, etc. 
+    // chrome://system
+    '--no-pings',
+    '--no-first-run',
+    '--no-default-browser-check',
+    '--disable-default-apps',
+    '--ash-no-nudges',
+    '--disable-infobars',
+    '--disable-search-engine-choice-screen',
+    '--disable-session-crashed-bubble',
+    '--simulate-outdated-no-au="Tue, 31 Dec 2099 23:59:59 GMT"',
+    '--hide-crash-restore-bubble',
+    '--suppress-message-center-popups',
+    '--disable-client-side-phishing-detection',
+    '--disable-domain-reliability',
+    '--disable-component-update',
+    '--disable-datasaver-prompt',
+    '--disable-hang-monitor',
+    // ('--disable-session-crashed-bubble' was listed a second time here, it is already set above)
+    '--disable-speech-synthesis-api',
+    '--disable-speech-api',
+    '--disable-print-preview',
+    '--safebrowsing-disable-auto-update',
+    '--deny-permission-prompts',
+    '--disable-external-intent-requests',
+    '--disable-notifications',
+    '--disable-desktop-notifications',
+    '--noerrdialogs',
+    '--disable-popup-blocking',
+    '--disable-prompt-on-repost',
+    '--silent-debugger-extension-api',
+    '--block-new-web-contents',
+    '--metrics-recording-only',
+    '--disable-breakpad',
+
+
+    // other feature flags
+    // chrome://flags chrome://components
+    `--disable-features=${CHROME_DISABLED_COMPONENTS.join(',')}`,
+    '--enable-features=NetworkService',
+]
+// extra flags appended after all the defaults by getChromeArgs(); empty by default
+const CHROME_ARGS_EXTRA = []
+
+
+// Everything getChromeArgs() needs, bundled so it can be used as that function's default argument
+const CHROME_LAUNCH_OPTIONS = {
+    CHROME_PROFILE_PATH,
+    CHROME_PROFILE_USER,
+    CHROME_EXTENSIONS,
+    CHROME_DEBUG_PORT,
+    CHROME_DISABLED_COMPONENTS,
+    DEFAULT_VIEWPORT,
+    CHROME_ARGS_DEFAULT,
+    CHROME_ARGS_EXTRA,
+}
+/* Chrome CLI Args Documentation
+ - https://github.com/GoogleChrome/chrome-launcher/blob/main/docs/chrome-flags-for-tools.md
+ - https://chromium.googlesource.com/chromium/chromium/+/master/content/public/common/content_switches.cc
+ - https://jtway.co/optimize-your-chrome-options-for-testing-to-get-x1-25-impact-4f19f071bf45
+ - https://peter.sh/experiments/chromium-command-line-switches/
+ - https://www.chromium.org/developers/how-tos/run-chromium-with-flags/
+ - 
https://github.com/manoj9788/Chrome-Driver-arguments/blob/master/README.md
+*/
+/**
+ * Build the final Chrome argv: CHROME_ARGS_DEFAULT, then the flags that depend on runtime values,
+ * then CHROME_ARGS_EXTRA. The runtime-dependent flags repeat ones already present in
+ * CHROME_ARGS_DEFAULT so that per-call overrides appear later in the list.
+ *
+ * Extensions that have no unpacked_path / id yet (not installed or not loaded) are skipped rather
+ * than being passed to Chrome as the literal string "undefined", and the two extension flags are
+ * omitted entirely when there is nothing to put in them.
+ */
+const getChromeArgs = ({CHROME_ARGS_DEFAULT, CHROME_ARGS_EXTRA,
+                        CHROME_PROFILE_PATH, CHROME_PROFILE_USER,
+                        CHROME_EXTENSIONS,
+                        CHROME_DEBUG_PORT,
+                        CHROME_DISABLED_COMPONENTS,
+                        DEFAULT_VIEWPORT}=CHROME_LAUNCH_OPTIONS) => {
+    const unpacked_paths = CHROME_EXTENSIONS.map(({unpacked_path}) => unpacked_path).filter(Boolean)
+    const extension_ids = CHROME_EXTENSIONS.map(({id}) => id).filter(Boolean)
+
+    return [
+        ...CHROME_ARGS_DEFAULT,
+        `--user-data-dir=${CHROME_PROFILE_PATH}`,
+        `--profile-directory=${CHROME_PROFILE_USER}`,
+        ...(unpacked_paths.length ? [`--load-extension=${unpacked_paths.join(',')}`] : []),
+        ...(extension_ids.length ? [`--allowlisted-extension-id=${extension_ids.join(',')}`] : []),
+        `--window-size=${DEFAULT_VIEWPORT.width},${DEFAULT_VIEWPORT.height}`,
+        `--remote-debugging-port=${CHROME_DEBUG_PORT}`,
+        `--disable-features=${CHROME_DISABLED_COMPONENTS.join(',')}`,
+        ...CHROME_ARGS_EXTRA,
+    ]
+}
+
+
+/******************** Chrome Extension Management *****************************/
+
+/**
+ * Compute the id Chrome assigns to an unpacked extension loaded from unpacked_path.
+ *
+ * @param unpacked_path  directory containing the extension's manifest.json
+ * @returns              32-character id made of the letters a-p, or null if there is no manifest.json
+ */
+function getExtensionId(unpacked_path) {
+    const manifest_path = path.join(unpacked_path, 'manifest.json')
+    if (!fs.existsSync(manifest_path)) return null
+
+    // chrome uses a SHA256 hash of the unpacked extension directory path to compute a dynamic id
+    const path_digest_hex = crypto
+        .createHash('sha256')
+        .update(Buffer.from(unpacked_path, 'utf-8'))
+        .digest('hex')
+
+    // take the first 32 hex digits and map each one (0-f) onto the letters a-p
+    const letter_a = 'a'.charCodeAt(0)
+    return Array.from(path_digest_hex.slice(0, 32))
+        .map(hex_digit => String.fromCharCode(parseInt(hex_digit, 16) + letter_a))
+        .join('')
+}
+
+async function installExtension(extension) {
+    const manifest_path = path.join(extension.unpacked_path, 'manifest.json')
+
+    // Download extensions using:
+    //   curl -fsSL 'https://clients2.google.com/service/update2/crx?response=redirect&prodversion=1230&acceptformat=crx3&x=id%3D$EXTENSION_ID%26uc' > extensionname.crx
+    //   unzip -d extensionname extensionname.crx
+
+    if (!fs.existsSync(manifest_path) && !fs.existsSync(extension.crx_path)) {
+        console.log("[đŸ› ī¸] 
Downloading missing extension", extension.name, extension.webstore_id, '->', extension.crx_path);
+
+        // Download crx file from ext.crx_url -> ext.crx_path
+        const response = await fetch(extension.crx_url) as Response
+        if (response.ok && response.headers.get("content-length") && response.body) {
+            // Only create the .crx once there is a body to write into it: an empty file left behind
+            // by a failed download would make the existsSync(crx_path) check above skip every retry.
+            const crx_file = fs.createWriteStream(extension.crx_path);
+            try {
+                // @ts-ignore
+                const crx_stream = Readable.fromWeb(response.body)
+                await finished(crx_stream.pipe(crx_file))
+            } catch(err) {
+                // same reasoning for a download that died half way: remove the truncated file, then rethrow
+                await fs.promises.rm(extension.crx_path, {force: true})
+                throw err
+            }
+        } else {
+            console.warn('[âš ī¸] Failed to download extension', extension.name, extension.webstore_id)
+        }
+    }
+
+    let stdout = ''
+    let stderr = ''
+
+    // Unzip crx file from ext.crx_path -> ext.unpacked_path
+    await fs.promises.mkdir(extension.unpacked_path, {recursive: true})
+    try {
+        // paths are quoted so that directories containing spaces survive the shell
+        // NOTE(review): execFile with an argv array would avoid the shell entirely
+        ({stdout, stderr} = await exec(`/usr/bin/unzip "${extension.crx_path}" -d "${extension.unpacked_path}"`))
+    } catch(err1) {
+        // keep whatever unzip printed so the failure message below is actually useful
+        stdout = err1?.stdout || stdout
+        stderr = err1?.stderr || String(err1)
+        try {
+            // fall back to the unzip() helper when the system binary is missing or exits non-zero
+            await unzip(extension.crx_path, extension.unpacked_path)
+        } catch(err2) {
+            // deliberately best-effort: success is decided by the manifest.json check below
+            stderr = `${stderr}\n${String(err2)}`
+        }
+    }
+
+    const manifest_found = fs.existsSync(manifest_path)
+    if (!manifest_found)
+        console.error(`[❌] Failed to install ${extension.crx_path}: could not find manifest.json in unpacked_path`, stdout, stderr)
+
+    return manifest_found
+}
+
+/**
+ * Fill in all derivable metadata on ext (urls, paths, manifest readers), install the extension
+ * from the webstore if it has no readable manifest version yet, then detect its id and version.
+ * Mutates and returns ext.
+ *
+ * @throws string  when ext has neither webstore_id nor unpacked_path
+ */
+async function loadOrInstallExtension(ext) {
+    if (!(ext.webstore_id || ext.unpacked_path))
+        throw 'Extension must have either {webstore_id} or {unpacked_path}'
+
+    // Set statically computable extension metadata
+    ext.webstore_id = ext.webstore_id || ext.id
+    ext.name = ext.name || ext.webstore_id
+    ext.webstore_url = ext.webstore_url || `https://chromewebstore.google.com/detail/${ext.webstore_id}`
+    ext.crx_url = ext.crx_url || `https://clients2.google.com/service/update2/crx?response=redirect&prodversion=1230&acceptformat=crx3&x=id%3D${ext.webstore_id}%26uc`
+    ext.crx_path = ext.crx_path || path.join(CHROME_EXTENSIONS_DIR, `${ext.webstore_id}__${ext.name}.crx`)
+    ext.unpacked_path 
= ext.unpacked_path || path.join(CHROME_EXTENSIONS_DIR, `${ext.webstore_id}__${ext.name}`)
+
+    // lazy readers: the manifest is re-read from disk on every call, so they reflect a fresh install
+    const manifest_path = path.join(ext.unpacked_path, 'manifest.json')
+    ext.read_manifest = () => JSON.parse(fs.readFileSync(manifest_path, 'utf-8'))
+    // manifest version string, or null when manifest.json is missing or has no version
+    ext.read_version = () => fs.existsSync(manifest_path) && ext.read_manifest()?.version || null
+
+    // if extension is not installed, download and unpack it
+    if (!ext.read_version()) {
+        await installExtension(ext)
+    }
+
+    // autodetect id from filesystem path (unpacked extensions dont have stable IDs)
+    ext.id = getExtensionId(ext.unpacked_path)
+    ext.version = ext.read_version()
+    if (!ext.version) {
+        // still no readable version after the install attempt above
+        console.warn('[❌] Unable to detect ID and version of installed extension', prettyPath(ext.unpacked_path))
+    } else {
+        // logged both for freshly installed and for already-present extensions
+        console.log(`[➕] Installed extension ${ext.name} (${ext.version})...`.padEnd(82), prettyPath(ext.unpacked_path))
+    }
+
+    return ext
+}
+
+// Inspect a browser target and report whether it belongs to an extension.
+// Returns {target_type, target_ctx, target_url, target_is_bg, target_is_extension, extension_id, manifest_version}.
+// Rethrows any error other than the "target already closed" race handled below.
+async function isTargetExtension(target) {
+    let target_type
+    let target_ctx
+    let target_url
+    try {
+        target_type = target.type()
+        // context used later to evaluate JS in: the worker if there is one, else the page, else null
+        target_ctx = (await target.worker()) || (await target.page()) || null
+        target_url = target.url() || target_ctx?.url() || null
+    } catch(err) {
+        if (String(err).includes('No target with given id found')) {
+            // because this runs on initial browser startup, we sometimes race with closing the initial
+            // new tab page. it will throw a harmless error if we try to check a target that's already closed,
+            // ignore it and report the target as type 'closed' with no context, since that page is
+            // definitely not an extension's bg page anyway
+            target_type = 'closed'
+            target_ctx = null
+            target_url = 'about:closed'
+        } else {
+            throw err
+        }
+    }
+
+    const target_is_bg = ['service_worker', 'background_page'].includes(target_type)
+    const target_is_extension = target_url?.startsWith('chrome-extension://')
+    // chrome-extension://<extension_id>/... -> <extension_id>
+    const extension_id = (target_is_extension && target_url.split('://')[1].split('/')[0]) || null
+    // service workers are treated as Manifest V3, every other target type as V2
+    const manifest_version = target_type === 'service_worker' ? 
'3' : '2'
+
+    return {
+        target_type,
+        target_ctx,
+        target_url,
+        target_is_bg,
+        target_is_extension,
+        extension_id,
+        manifest_version,
+    }
+}
+
+// If target is the background page / service worker of an extension, read its manifest from inside
+// the extension context and build an extension record with dispatch* helpers that run code there.
+// Returns the new record, or null when the target is not a usable extension background context.
+// The matching entry in `extensions` (looked up by id) is also updated in place.
+async function loadExtensionFromTarget(extensions, target) {
+    const {
+        target_is_bg,
+        target_is_extension,
+        target_type,
+        target_ctx,
+        target_url,
+        extension_id,
+        manifest_version,
+    } = await isTargetExtension(target)
+
+    // only background contexts that belong to an extension and can evaluate JS are of interest
+    if (!(target_is_bg && extension_id && target_ctx))
+        return null
+
+    const manifest = await target_ctx.evaluate(() =>
+        // @ts-ignore
+        chrome.runtime.getManifest())
+
+    const { name, version, homepage_url, options_page, options_ui } = manifest
+
+    if (!version || !extension_id)
+        return null
+
+    // resolve the options page to a full extension url, falling back to 'options.html' when the manifest names none
+    const options_url = await target_ctx.evaluate(
+        (options_page) => chrome.runtime.getURL(options_page),
+        options_page || options_ui?.page || 'options.html',
+    )
+
+    // NOTE(review): `commands` is only referenced by the commented-out log line below
+    const commands = await target_ctx.evaluate(async () =>
+        (await new Promise((resolve, reject) => {
+            if (chrome.commands)
+                chrome.commands.getAll(resolve)
+            else
+                resolve({})
+        }))
+    )
+
+    // console.log(`[+] Found Manifest V${manifest_version} Extension:`, extension_id, name, target_url, Object.keys(commands).length)
+
+    // run arbitrary JS in the extension's background context
+    let dispatchEval = async (...args) =>
+        await target_ctx.evaluate(...args)
+    // open the extension popup via chrome.action when available, otherwise open popup.html in a tab
+    let dispatchPopup = async () =>
+        await target_ctx.evaluate('chrome.action?.openPopup() || chrome.tabs.create({url: chrome.runtime.getURL("popup.html")})')
+
+    // the remaining helpers differ by manifest version, and stay undefined for any other value
+    let dispatchAction
+    let dispatchMessage
+    let dispatchCommand
+
+    if (manifest_version === '3') {
+        dispatchAction = async (tab) => {
+            // https://developer.chrome.com/docs/extensions/reference/api/action#event-onClicked
+            return await target_ctx.evaluate(async (tab) => {
+                // default to the active tab of the current window when no tab is given
+                tab = tab || (await new Promise((resolve) =>
+                    chrome.tabs.query({currentWindow: true, active: true}, ([tab]) => resolve(tab))))
+                // @ts-ignore
+                return await chrome.action.onClicked.dispatch(tab)
+            }, tab)
+        }
+        dispatchMessage = async (message, options) => {
+            // https://developer.chrome.com/docs/extensions/reference/api/runtime
+            return await target_ctx.evaluate(async (extension_id, message, options) => {
+                return await chrome.runtime.sendMessage(extension_id, message, options)
+            }, extension_id, message, options)
+        }
+        dispatchCommand = async (command, tab) => {
+            // https://developer.chrome.com/docs/extensions/reference/api/commands#event-onCommand
+            return await target_ctx.evaluate(async (command, tab) => {
+                // @ts-ignore
+                return await chrome.commands.onCommand.dispatch(command, tab)
+            }, command, tab)
+        }
+    } else if (manifest_version === '2') {
+        // same three helpers using the MV2 APIs (browserAction instead of action, callbacks instead of promises)
+        dispatchAction = async (tab) => {
+            // https://developer.chrome.com/docs/extensions/mv2/reference/browserAction#event-onClicked
+            return await target_ctx.evaluate(async (tab) => {
+                tab = tab || (await new Promise((resolve) =>
+                    chrome.tabs.query({currentWindow: true, active: true}, ([tab]) => resolve(tab))))
+                // @ts-ignore
+                return await chrome.browserAction.onClicked.dispatch(tab)
+            }, tab)
+        }
+        dispatchMessage = async (message, options) => {
+            // https://developer.chrome.com/docs/extensions/mv2/reference/runtime#method-sendMessage
+            return await target_ctx.evaluate(async (extension_id, message, options) => {
+                return await new Promise((resolve) =>
+                    chrome.runtime.sendMessage(extension_id, message, options, resolve)
+                )
+            }, extension_id, message, options)
+        }
+        dispatchCommand = async (command, tab) => {
+            // https://developer.chrome.com/docs/extensions/mv2/reference/commands#event-onCommand
+            return await target_ctx.evaluate(async (command, tab) => {
+                return await new Promise((resolve) =>
+                    // @ts-ignore
+                    chrome.commands.onCommand.dispatch(command, tab, resolve)
+                )
+            }, command, tab)
+        }
+    }
+    // entry with the same id from the configured list, or a throwaway {} when the extension is not in it
+    const existing_extension = extensions.filter(({id}) => id === extension_id)[0] || {}
+
+    const new_extension = {
+        ...existing_extension,
+        id: extension_id,
+        webstore_name: name,
+
+        target,
+        target_ctx,
+        target_type,
+        target_url,
+
+        manifest_version,
+        manifest,
+        version,
+        homepage_url,
+        options_url,
+
+        dispatchEval,         // run some JS in the extension's service worker context
+        dispatchPopup,        // open the extension popup
+        dispatchAction,       // trigger an extension menubar icon click
+        dispatchMessage,      // send a chrome runtime message in the service worker context
+        dispatchCommand,      // trigger an extension keyboard shortcut command
+    }
+
+    console.log(`[➕] Loaded extension ${name.substring(0, 32)} (${version}) ${target_type}...`.padEnd(82), target_url)
+    // copy the runtime metadata onto the matching entry of `extensions` (a no-op for the caller when no entry matched)
+    Object.assign(existing_extension, new_extension)
+
+    return new_extension
+}
+
+
+
+/**
+ * Make sure every extension in CHROME_EXTENSIONS is present on disk (installing any that are
+ * missing) and fill in its filesystem metadata in place. Errors are logged, not thrown, so a single
+ * broken extension does not prevent startup.
+ *
+ * NOTE(review): CHROME_EXTENSIONS_DIR is accepted but not used in this function body.
+ *
+ * @returns the same CHROME_EXTENSIONS array that was passed in
+ */
+async function getChromeExtensionsFromPersona({CHROME_EXTENSIONS, CHROME_EXTENSIONS_DIR}) {
+    console.log('*************************************************************************')
+    console.log(`[âš™ī¸] Installing ${CHROME_EXTENSIONS.length} chrome extensions from CHROME_EXTENSIONS...`)
+    try {
+        // read extension metadata from filesystem (installing from Chrome webstore if extension is missing)
+        for (const extension of CHROME_EXTENSIONS) {
+            Object.assign(extension, await loadOrInstallExtension(extension))
+        }
+
+        // for easier debugging, write parsed extension info to filesystem
+        await overwriteFile(
+            CHROME_EXTENSIONS_JSON_PATH.replace('.json', '.present.json'),
+            CHROME_EXTENSIONS,
+        )
+    } catch(err) {
+        console.error(err)
+    }
+    console.log('*************************************************************************')
+    return CHROME_EXTENSIONS
+}
+
+// module-level cache: the running browser is only scanned for extension targets once per process
+let _EXTENSIONS_CACHE = null
+/**
+ * Return the extensions enriched with runtime metadata from the running browser. The first call
+ * scans browser.targets() and writes the result to disk; later calls return the cached array
+ * (and ignore their arguments).
+ *
+ * NOTE(review): extensions_dir is accepted but not used in this function body.
+ */
+async function getChromeExtensionsFromCache({browser, extensions=CHROME_EXTENSIONS, extensions_dir=CHROME_EXTENSIONS_DIR}) {
+    if (_EXTENSIONS_CACHE === null) {
+        // report the list that is actually being loaded, which is not necessarily the global default
+        console.log(`[âš™ī¸] Loading ${extensions.length} chrome extensions from CHROME_EXTENSIONS...`)
+
+        // find loaded Extensions at runtime / browser launch time & connect handlers
+        // looks at all the open targets for extension service workers / bg pages
+        for (const target of browser.targets()) {
+            // mutates extensions object in-place to add 
metadata loaded from filesystem persona dir + await loadExtensionFromTarget(extensions, target) + } + _EXTENSIONS_CACHE = extensions + + // write installed extension metadata to filesystem extensions.json for easier debugging + await overwriteFile( + CHROME_EXTENSIONS_JSON_PATH.replace('.json', '.loaded.json'), + extensions, + ) + await overwriteSymlink( + CHROME_EXTENSIONS_JSON_PATH.replace('.json', '.loaded.json'), + CHROME_EXTENSIONS_JSON_PATH, + ) + } + + return _EXTENSIONS_CACHE +} + +async function setup2CaptchaExtension({browser, extensions}) { + let page = null + try { + // open a new tab to finish setting up the 2captcha extension manually using its extension options page + page = await browser.newPage() + const { options_url } = extensions.filter(ext => ext.name === 'twocaptcha')[0] + await page.goto(options_url) + await wait(2_500) + await page.bringToFront() + + // type in the API key and click the Login button (and auto-close success modal after it pops up) + await page.evaluate(() => { + const elem = document.querySelector("input[name=apiKey]") as HTMLInputElement + elem.value = "" + }) + await page.type('input[name=apiKey]', API_KEY_2CAPTCHA, { delay: 25 }) + + // toggle all the important switches to ON + await page.evaluate(() => { + const checkboxes = Array.from(document.querySelectorAll('input#isPluginEnabled, input[name*=enabledFor], input[name*=autoSolve]')); + for (const checkbox of checkboxes) { + if (!checkbox.checked) checkbox.click() + } + }) + + let dialog_opened = false + page.on('dialog', async (dialog) => { + setTimeout(async () => { + await dialog.accept(); + dialog_opened = true + }, 500); + }) + await page.click('button#connect') + await wait(2_500) + if (!dialog_opened) { + throw `2captcha extension login confirmation dialog never opened, please check its options page manually: ${options_url}` + } + console.log('[🔑] Configured the 2captcha extension using its options page...') + } catch(err) { + console.warn(`[❌] Failed to configure 
the 2captcha extension using its options page!`, err) + } + if (page) await page.close() +} + +async function speedtest({browser, page, measureUpload=true, timeout=25000}: {browser?: Browser, page?: Page, measureUpload?: boolean, timeout?: number}) { + // run a speedtest using fast.com, printing results once per second + + browser = browser || await page.browser() + page = page || await browser.newPage() + + // save one speedtest_.json result per day + const today = versionStrFromDate(new Date(), {withDate: true, withTime: false}) + const SPEEDTEST_PATH = path.join(SPEEDTESTS_DIR, `speedtest_${today}.json`) + + // check if we've already run one today, if so return earlier results and skip running again + try { + return JSON.parse(await fs.promises.readFile(SPEEDTEST_PATH, 'utf-8')) + } catch(err) { + // otherwise speedtest does not exist yet for today, continue onwards... + } + + console.log('[🚤] Running Speedtest using Fast.com...'.padEnd(82), prettyPath(SPEEDTEST_PATH)) + + await page.goto('https://fast.com', {timeout, waitUntil: 'domcontentloaded'}); + await page.waitForSelector('#speed-value', {timeout}) + + let result = null + let loop_idx = 0 + + while (loop_idx < 100) { + result = await page.evaluate(() => { + const $ = document.querySelector.bind(document); + + return { + downloadSpeed: Number($('#speed-value').textContent), + downloadUnit: $('#speed-units').textContent.trim(), + downloaded: Number($('#down-mb-value').textContent.trim()), + uploadSpeed: Number($('#upload-value').textContent), + uploadUnit: $('#upload-units').textContent.trim(), + uploaded: Number($('#up-mb-value').textContent.trim()), + latency: Number($('#latency-value').textContent.trim()), + bufferBloat: Number($('#bufferbloat-value').textContent.trim()), + userLocation: $('#user-location').textContent.trim(), + userIp: $('#user-ip').textContent.trim(), + isDone: Boolean($('#speed-value.succeeded') && $('#upload-value.succeeded')), + }; + }) + if (result.downloadSpeed > 0) { + // 
console.log(JSON.stringify(result).replaceAll('"', '').replaceAll(',', ' ').replaceAll('{', '').replaceAll('}', '')) + } + + if (result.isDone || (!measureUpload && result.uploadSpeed)) { + break + } + + await wait(500) + loop_idx++ + } + + await Promise.allSettled([ + page.close(), + overwriteFile(SPEEDTEST_PATH, result) + ]) + + return result +} + +/******************************************************************************/ +/******************************************************************************/ + +const ALREADY_ARCHIVED = new Set(['', 'about:blank', 'chrome://newtab', 'chrome://version']) +const TASKS_PER_RUN_LIMIT = 200 + +async function botArchiveTask({page, data, url=''}) { + url = url || data // puppeteer-cluster passes in the url value via the data: arg + + const is_unarchivable_url = URL_SCHEMES_IGNORED.includes(url.split(':')[0]) + const is_already_archived = ALREADY_ARCHIVED.has(url.slice(0, 4096)) + if (is_unarchivable_url || is_already_archived) return null + ALREADY_ARCHIVED.add(url.slice(0, 4096)) + + if (ALREADY_ARCHIVED.size > TASKS_PER_RUN_LIMIT) { + console.warn('[❌] Hit maximum URLs archived per browser session, exiting to free memory.') + console.warn(' Run this process again to continue with the next batch...') + process.exit(21) + } + + const browser = await page.browser() + const client = await page.target().createCDPSession() + const extensions = await getChromeExtensionsFromCache({browser}) + const browser_version = await browser.version() + const original_url = url.toString() + const start_time = (new Date()) + + console.log('[0/4]-------------------------------------------------------------------------') + const snapshot_dir = await setupSnapshotDir({original_url, start_time}) + const snapshot = await setupSnapshotDB({original_url, start_time, snapshot_dir}) + console.log('[1/4]-------------------------------------------------------------------------') + console.log(`[đŸĒŸ] Starting page & viewport setup (${browser_version} 
${DEFAULT_VIEWPORT.isMobile ? 'mobile' : 'desktop'} ${DEFAULT_VIEWPORT.width}x${DEFAULT_VIEWPORT.height}px)...`) + + + const page_state = { + // global static state + browser, + client, + browser_version, + extensions, + + // per-page static metadata + original_url, + snapshot, + snapshot_dir, + start_time: start_time.toISOString(), + start_ts: Number(start_time), + version: versionStrFromDate(start_time), + + // per-page mutable archiving state + main_response: null, + recorder: null, + console_log: [], + traffic_log: {}, + redirects: {}, + } + page._original_url = original_url + + try { + // run all page setup functions in parallel + const results = await Promise.allSettled([ + // loadAuthStorage(page, page_state, { apply: true }), + startMetadataRecording(page, page_state), + setupURLRewriting(page, page_state), + // setupViewport(page, page_state), + setupModalAutoClosing(page, page_state), + loadCloudflareCookie(page, page_state), + startResponseSaving(page, page_state), + saveYTDLP(page, page_state), + saveGALLERYDL(page, page_state), + // saveSourceMaps(page, page_state), + // TODO: someday setup https://github.com/osnr/TabFS ? 
+ ]); + // run all page setup functions in parallel + const rejected = results + .filter(result => result.status === 'rejected') + .map(result => (result as PromiseRejectedResult).reason); + if (rejected.length) console.warn('[âš ī¸] Partial failures during page setup:', rejected); + } catch(err) { + console.error('[❌] PAGE SETUP ERROR', JSON.stringify(err, null, 4)) + return + } + + + console.log('[2/4]-------------------------------------------------------------------------') + + console.log('[âžĄī¸] NAVIGATION[INI]', ANSI.blue + url + ANSI.reset) + const startrecording_promise = startScreenrecording(page, page_state) + page_state.main_response = await page.goto(url, {waitUntil: 'load', timeout: 40_000}) + try { + const results = await Promise.allSettled([ + startrecording_promise, + page.bringToFront(), + page.waitForNetworkIdle({concurrency: 0, idleTime: 900, timeout: 20_000}), + ]) + const rejected = results + .filter(result => result.status === 'rejected') + .map(result => (result as PromiseRejectedResult).reason) + if (rejected.length) console.warn('[âš ī¸] Parial failures during page load:', rejected) + } catch(err) { + console.error('[❌] ERROR DURING PAGE LOAD', JSON.stringify(err, null, 4)) + return + } + + if (page_state.main_response === null) { + page_state.main_response = await page.waitForResponse(() => true) + } + assert(page_state.main_response) + if (page_state.main_response.status() == 429) { + throw `[âš ī¸] Got 429 rate-limit response, skipping this URL for now...` + } + + // emulate human browsing behavior + // await disableAnimations(page, page_state); + await jiggleMouse(page, page_state); + await solveCaptchas(page, page_state); + await blockRedirects(page, page_state); + await scrollDown(page, page_state); + // await expandComments(page, page_state); + await submitForm(page, page_state); + // await blockJSExecution(page, page_state); + + console.log('[3/4]-------------------------------------------------------------------------') + + 
// stop tampering with page requests & JS / recording metadata / traffic log + await stopMetadataRecording(page, page_state) + + // do all synchonous archiving steps that need exclusive use of the whole page while doing stuff + const saveScreenrecording_promise = saveScreenrecording(page, page_state); + await saveScreenshot(page, page_state); + await savePDF(page, page_state); + + console.log('[4/4]-------------------------------------------------------------------------') + + // do all async archiving steps that can be run at the same time + await inlineShadowDOM(page, page_state); + const results = await Promise.allSettled([ + saveTitle(page, page_state), + saveSEO(page, page_state), + saveFavicon(page, page_state), + saveSSL(page, page_state), + saveRequests(page, page_state), + saveRedirects(page, page_state), + saveHeaders(page, page_state), + saveRaw(page, page_state), + saveDOM(page, page_state), + saveBodyText(page, page_state), + // savePandoc(page, page_state), + saveReadability(page, page_state), + saveAccessibility(page, page_state), + saveOutlinks(page, page_state), + // saveAuthStorage(page, page_state), + saveAIQualityAssuranceResult(page, page_state), + ]); + + // do all sync archiving steps that require browser extensions at the very end (they are the buggiest) + const bg_results = Promise.allSettled([ + saveScreenrecording_promise, + saveSinglefile(page, page_state), + // saveArchiveWebPage(page, page_state), + // savePocket(page, page_state), + ]) + + const {duration} = await saveMetrics(page, page_state); + + const rejected = results + .filter(result => result.status === 'rejected') + .map(result => (result as PromiseRejectedResult).reason) // not sure why this has a ts-error, .reason does exist on rejected promises + + if (rejected.length) + console.warn('[âš ī¸] Parial failures during archiving:', rejected) + + // Start an interactive REPL here with the `page` instance. 
+ // https://github.com/berstend/puppeteer-extra/tree/master/packages/puppeteer-extra-plugin-repl + // await page.repl() + // await page.browser().repl() + + console.log(`[✅] ${ANSI.blue}Finished archiving in ${duration/1000}s.${ANSI.reset}`) + + try { + const rejected = (await bg_results) + .filter(result => result.status === 'rejected') + .map(result => (result as PromiseRejectedResult).reason) // not sure why this has a ts-error, .reason does exist on rejected promises + if (rejected.length) + console.warn('[âš ī¸] Parial failures during wrap-up tasks:', rejected) + + console.log('[đŸ—‘ī¸] Resetting to about:blank to ensure memory is freed...') + await page.goto('about:blank') + await page.close() + } catch(err) { + console.log(err) + } + + // symlink the best results from across all the versions/ into the snapshot dir root + await symlinkBestSnapshotResults(snapshot_dir) + + // display latest version screenshot GIF + console.log() + try { + const latest_version_gif = path.join(snapshot_dir, 'versions', page_state.version, path.basename(SCREENRECORDGIF_PATH(page))) + const dirent = await blockUntilExists(latest_version_gif, {min_bytes: 100, timeout: 15_000}) + child_process.spawn('/Users/squash/.iterm2/imgcat', [dirent.abspath], {stdio: [null, 'inherit', 'inherit']}) + } catch(err) { + console.warn('[âš ī¸] Failed to display screenrecording.gif...', err) + console.log() + } + + // determine whether task succeeded or failed based on AI QA score + const latest_version_aiqa = path.join(snapshot_dir, 'versions', page_state.version, path.basename(AIQA_PATH(page))) + const qa_results = JSON.parse((await fs.promises.readFile(latest_version_aiqa)).toString()) + if (qa_results.pct_visible < 50) { + throw `[❌] Task completed with problems, got AI QA score of ${qa_results.pct_visible}%! 
${qa_results.warnings.join(', ')} ${qa_results.error_text || ''}` + } else { + console.log(`[đŸ’Ģ] Task completed succesfully: ${qa_results.pct_visible}% ${qa_results.warnings.join(', ') || ''}`) + console.log(` Summary: ${(qa_results.main_content_title || qa_results.description || 'No title/description detected').substring(0, 80)}... ${qa_results.main_content_author || ''} ${qa_results.main_content_date || ''}`) + return true + } +} + +async function passiveArchiveTask({browser, page, url}) { + // archive passively (e.g. a tab that was opened already by a human), without changing the active page + + const is_unarchivable_url = URL_SCHEMES_IGNORED.includes(url.split(':')[0]) + const is_already_archived = ALREADY_ARCHIVED.has(url.slice(0, 4096)) + if (is_unarchivable_url || is_already_archived) return null + ALREADY_ARCHIVED.add(url.slice(0, 4096)) + + // these have to be as early as possible because we're racing with the page load (we might even be too late) + // jk nevermind, we now re-open a new bg tab for every tab that's created to re-capture the initial request + // await page.setRequestInterception(true); + // await page.setCacheEnabled(false); + + const original_url = url.toString() + const start_time = (new Date()) + const browser_version = await browser.version() + + console.log('------------------------------------------------------------------------------') + console.log('[➕] Starting archive of new tab opened in driver browser...', await browser.version()) + const snapshot_dir = await setupSnapshotDir({original_url, start_time}) + const snapshot = await setupSnapshotDB({ original_url, start_time, snapshot_dir }) + console.log('------------------------------------------------------------------------------') + console.log(`[đŸĒŸ] Starting page & viewport setup (${browser_version} ${DEFAULT_VIEWPORT.isMobile ? 
'mobile' : 'desktop'} ${DEFAULT_VIEWPORT.width}x${DEFAULT_VIEWPORT.height}px)...`) + + // create a new page in the background for archiving + const old_page = page + page = await browser.newPage() + await old_page.bringToFront() + const client = await page.target().createCDPSession() + const extensions = await getChromeExtensionsFromCache({ browser }) + + const page_state = { + // global static state + browser, + client, + browser_version, + extensions, + + // per-page static metadata + original_url, + snapshot, + snapshot_dir, + start_time: start_time.toISOString(), + start_ts: Number(start_time), + version: versionStrFromDate(start_time), + + // per-page mutable archiving state + main_response: null, + recorder: null, + console_log: [], + traffic_log: {}, + redirects: {}, + } + page._original_url = original_url + + try { + + // run all page setup functions in parallel + const results = await Promise.allSettled([ + // loadAuthStorage(page, page_state, {apply: true}), + startMetadataRecording(page, page_state), + setupURLRewriting(page, page_state), + startResponseSaving(page, page_state), + saveYTDLP(page, page_state), + saveGALLERYDL(page, page_state), + // saveSourceMaps(page, page_state), + ]); + const rejected = results + .filter(result => result.status === 'rejected') + .map(result => (result as PromiseRejectedResult).reason) + if (rejected.length) console.warn('[âš ī¸] Parial failures during page setup:', rejected) + } catch(err) { + console.warn('[❌] ERROR DURING PAGE SETUP', JSON.stringify(err, null, 4)) + return + } + + // load the url in the background page, then switch to it once its loaded and close the original tab + console.log('[âžĄī¸] NAVIGATION[INI]', ANSI.blue + url + ANSI.reset) + const startrecording_promise = startScreenrecording(page, page_state) + page_state.main_response = await page.goto(url, {waitUntil: 'load', timeout: 40_000}) + + // for debugging + globalThis.page = page + globalThis.page_state = page_state + + // start loading the 
page, start screenrecording, close the old page, and wait for loading to finish (all at once, fine for these to race) + try { + const results = await Promise.allSettled([ + startrecording_promise, + page.bringToFront(), + old_page.close(), + page.waitForNetworkIdle({concurrency: 0, idleTime: 900, timeout: 20_000}), + ]) + const rejected = results + .filter(result => result.status === 'rejected') + .map(result => (result as PromiseRejectedResult).reason) + if (rejected.length) console.warn('[âš ī¸] Parial failures during [age load:', rejected) + } catch(err) { + console.warn('[❌] ERROR DURING PAGE LOAD', JSON.stringify(err, null, 4)) + return + } + + if (page_state.main_response === null) { + page_state.main_response = await page.waitForResponse(() => true) + } + assert(page_state.main_response) + if (page_state.main_response.status() == 429) { + throw `[âš ī¸] Got 429 rate-limit response, skipping this URL for now...` + } + + // resume page if paused by waitForDebuggerOnStart/dev tools debugger/backgrounding + try { + await client.send('Page.enable'); + await client.send('Page.setWebLifecycleState', {state: 'active'}); + await client.send('Runtime.runIfWaitingForDebugger') + } catch(err) { /* console.warn(err) */ } + + // wait a couple seconds for page to finish loading + await wait(5_000) + + // emulate human browsing behavior + // await disableAnimations(page, page_state); + // await jiggleMouse(page, page_state); + await solveCaptchas(page, page_state); + // await blockRedirects(page, page_state); + // await scrollDown(page, page_state); + // await expandComments(page, page_state); + await submitForm(page, page_state); + // await blockJSExecution(page, page_state); + await stopMetadataRecording(page, page_state) // stop tampering with page requests & JS + + console.log('[3/4]-------------------------------------------------------------------------') + + // do all synchonous archiving steps that need exclusive use of the whole page while doing stuff + const 
saveScreenrecording_promise = saveScreenrecording(page, page_state); + await saveScreenshot(page, page_state); + await savePDF(page, page_state); + + console.log('[4/4]-------------------------------------------------------------------------') + + // do all async archiving steps that can be run at the same time + await inlineShadowDOM(page, page_state); + const results = await Promise.allSettled([ + saveTitle(page, page_state), + saveSEO(page, page_state), + saveFavicon(page, page_state), + saveSSL(page, page_state), + saveRequests(page, page_state), + saveRedirects(page, page_state), + saveHeaders(page, page_state), + saveRaw(page, page_state), + saveDOM(page, page_state), + saveBodyText(page, page_state), + // savePandoc(page, page_state), + saveReadability(page, page_state), + saveAccessibility(page, page_state), + saveOutlinks(page, page_state), + // saveAuthStorage(page, page_state), + saveAIQualityAssuranceResult(page, page_state), + ]); + + // do all sync archiving steps that require browser extensions at the very end (they are the buggiest) + const bg_results = Promise.allSettled([ + saveScreenrecording_promise, + saveSinglefile(page, page_state), + // saveArchiveWebPage(page, page_state), + // savePocket(page, page_state), + ]) + + const {duration} = await saveMetrics(page, page_state); + + const rejected = results + .filter(result => result.status === 'rejected') + .map(result => (result as PromiseRejectedResult).reason) + + if (rejected.length) + console.warn('[âš ī¸] Parial failures during page archiving:', rejected) + + // Start an interactive REPL here with the `page` instance. 
async function setupSnapshotDir({original_url, start_time, snapshot_dir=null}) {
    // Prepare the output folder for a new snapshot of original_url and return its path.
    //
    //   original_url:  URL being archived (used to look up / derive the snapshot dir)
    //   start_time:    start of this archiving run, forwarded to updateSnapshotDirIndexes()
    //   snapshot_dir:  optional explicit dir, defaults to TASK_PATH(original_url)
    //
    // Throws (a string) if a *different* directory already holds data for this URL, or if
    // the chosen dir is neither the known dir nor the legacy fallback dir for this URL.
    // Side effects: creates <snap_dir>/versions, chdir()s into snap_dir, removes root symlinks,
    // moves loose output files into versions/, updates indexes, and validates the result.

    const snap_dir = snapshot_dir || TASK_PATH(original_url)

    console.log()
    console.log()
    console.log(ANSI.blue + original_url + ANSI.reset)
    console.log(ANSI.black + snap_dir + ANSI.reset)
    console.log()
    console.log('[📂] Setting up Snapshot output directory...'.padEnd(82), prettyPath(snap_dir))

    // check for existing data at old legacy paths e.g. ./data/archive/1999999999.1723425
    const hacky_dir = path.join(ARCHIVE_DIR, `1999999999.${hashCode(original_url)}`)
    const known_dir = SNAPSHOT_DIRS_BY_URL[original_url]

    // A candidate is only a duplicate when it is a different path than snap_dir and exists on disk.
    // (Previously, when snap_dir == hacky_dir == known_dir and the dir already existed, the dir was
    // reported as a duplicate of itself and re-snapshotting that URL always threw.)
    // known_dir is checked first so the reported path matches the original precedence.
    const duplicate_dir = [known_dir, hacky_dir]
        .find(candidate => candidate && candidate != snap_dir && fs.existsSync(candidate))
    if (duplicate_dir) {
        throw `Tried to create snapshot in ${snap_dir} but potential duplicate exists in ${duplicate_dir}!`
    }

    // refuse to write into a directory that is neither of the two recognized locations for this URL
    if (snap_dir != hacky_dir && snap_dir != known_dir) {
        throw `Tried to create snapshot in ${snap_dir} but its not a recognized snapshot dir path:\n - ${known_dir}\n - ${hacky_dir}`
    }

    // mkdir -p <snap_dir>/versions && cd <snap_dir>
    await fs.promises.mkdir(path.join(snap_dir, 'versions'), {recursive: true})
    process.chdir(snap_dir)

    // clear any symlinks in the snapshot root that point at files inside ./versions/<version>/
    await clearSnapshotDirSymlinks(snap_dir)

    // move loose output files left in the snapshot root by a prior run into ./versions/<version>/
    await collectSnapshotDirVersionFiles(snap_dir)

    // update the index folders to include references to this snapshot dir as-needed
    await updateSnapshotDirIndexes(snap_dir, {original_url, start_time})

    // assert the snapshot root contains no invalid/partial files + is empty/ready to receive new files
    await assertSnapshotDirIsValid(snap_dir, {is_empty: true})

    return snap_dir
}
async function updateSnapshotDirIndexes(snap_dir, page_state, indexes=INDEXES, indexes_dir=INDEXES_DIR) {
    // Link the snapshot dir into every index described by `indexes`.
    //
    //   snap_dir:     snapshot directory to link
    //   page_state:   object handed to each index key getter (e.g. {original_url, start_time})
    //   indexes:      map of index_name -> index_key_getter(page_state)
    //   indexes_dir:  root folder that holds the index folders
    assert(indexes)
    console.log(`[🔎] Linking Snapshot in indexes (${Object.keys(indexes).join(', ')})...`)
    for (const [index_name, index_key_getter] of Object.entries(indexes)) {
        // sequential on purpose: each call creates folders + symlinks on disk
        await indexSnapshotDir(snap_dir, {index_name, index_key_getter, indexes_dir}, page_state)
    }
}

async function indexSnapshotDir(snap_dir, {index_name, index_key_getter, indexes_dir=INDEXES_DIR}, page_state) {
    // Place a symlink to this snapshot inside one index:
    //     <indexes_dir>/<index_name>/<index_key>/<snapshot dir name> -> <snap_dir>
    // Returns the absolute path of the symlink, or null when the getter yields an empty key.
    //
    // NOTE(review): the original body used `symlink_path` without ever defining it and never
    // called index_key_getter, so it could not run. The derivation below is reconstructed from
    // the INDEXES contract (getter returns the key string) -- confirm the intended folder layout.
    assert(index_name && index_key_getter)
    assert(page_state)

    const index_key = String(index_key_getter(page_state) || '')
    if (!index_key) {
        // e.g. snapshots_by_domain returns '' for URLs without a hostname: nothing to file this under
        return null
    }

    // overwriteSymlink() is called with mkdirs: false, so the parent folder must exist beforehand
    const index_entries_dir = path.join(indexes_dir, index_name, index_key)
    await fs.promises.mkdir(index_entries_dir, {recursive: true})

    const symlink_path = path.join(index_entries_dir, path.basename(snap_dir))
    const {symlink_abspath} = await overwriteSymlink(snap_dir, symlink_path, {relative: true, mkdirs: false})
    return symlink_abspath
}
{withFileTypes: true})) + .filter(dirent => { + if (dirent.name.startsWith('.')) return false // ignore hidden files, dont version them + if (dirent.name == 'versions') return false // dont try to move versions folder into itself + if (dirent.isSymbolicLink()) return false // skip existing symbolic links + return (dirent.isFile() || dirent.isDirectory()) // dont try to version sockets/FIFOs/devs etc. + }) + + if (existing_snapshot_files.length) { + console.log(`[📅] Moving snapshot results into version dir: ./data/archive/${snap_id}/* ->`.padEnd(82), `./data/archive/${snap_id}/versions/${VERSION}/`) + } + + const snapshot_files = await getDirInfo(snap_dir, {withRoot: false, filter: ({relpath}) => !relpath.startsWith('versions')}) + const version_files = await getDirInfo(version_dir, {withRoot: false}) + + for (const {name} of existing_snapshot_files) { + const snapdir_entry_abspath = path.join(snap_dir, name) + const versioned_entry_abspath = path.join(version_dir, name) + + const snapshot_entry = snapshot_files[name] + const version_entry = version_files[name] + + if (snapshot_entry && version_entry) { + // a conflicting file/dir already exists in the destination path + // we have a few options here, we can try to merge them, or we can create a new version + + if (snapshot_entry.sha256 == version_entry.sha256) { + // both are the same already, delete the duplicate (leaving the copy inside the version dir) + // if (snapshot_entry.is_dir) { + // await fs.promises.rmdir(snapshot_entry.abspath, {recursive: true}) + // } else { + // await fs.promises.unlink(snapshot_entry.abspath) + // } + // console.warn(`[!] Found harmless exact duplicate files, leaving as is: ${snapshot_entry.summary} and ${version_entry.summary}`) + } else { + // both are different, + if (snapshot_entry.num_bytes > version_entry.num_bytes) { + // snapshot entry is bigger, keep it and delete version entry? 
async function clearSnapshotDirSymlinks(snap_dir) {
    // Remove every symlink sitting directly in the snapshot dir root, so new output can be written there.
    // Hidden entries and the versions folder are never touched; regular files/dirs are left alone.
    //
    // Symlinks must be gone before a new run starts, otherwise fresh output can be written *through*
    // a stale link into a previous version folder (e.g. ./media pointing at an older version's media dir).

    const root_entries = await fs.promises.readdir(snap_dir, {withFileTypes: true})

    for (const entry of root_entries) {
        const is_hidden = entry.name.startsWith('.')
        const is_versions_dir = entry.name == 'versions'
        if (is_hidden || is_versions_dir || !entry.isSymbolicLink()) continue

        await fs.promises.unlink(path.join(snap_dir, entry.name))
    }
}
fs.promises.lstat(snapdir_entry)).isSymbolicLink()) { + // if a symlink already exists in the root with the same name, + // check if the version file we're looking at is a better candidate to replace it + + const existing_abspath = await fs.promises.realpath(snapdir_entry) + const desired_abspath = path.join(version_dir_abspath, filename) + if (existing_abspath != desired_abspath) { + // check if the new candidate is larger or if the existing symlink is larger (largest file = most likely to be highest quality capture data) + const largest_path = await getLargestPath(existing_abspath, desired_abspath) + if (largest_path != (await fs.promises.realpath(existing_abspath))) { + const larger_version = path.basename(path.dirname(largest_path)) + const larger_abspath = path.join(snap_dir, 'versions', larger_version, filename) + + // console.log(' - swapping for larger file:', filename, '->', larger_abspath.split('/archive/').at(-1)) + await overwriteSymlink(larger_abspath, snapdir_entry, {search_limit: snap_dir}) + } else { + // console.log(' - leaving larger file:', largest_path.split('/archive/').at(-1)) + } + } else { + // leave existing symlink pointing to current version file, nothing to change + // console.log(' - leaving current file:', existing_abspath.split('/archive/').at(-1)) + } + } else { + // clearSnapshotDirSymlinks() should have already cleared these files out! + throw `Non-symlink file found in root of snapshot dir! 
async function assertSnapshotDirIsValid(snap_dir, {is_empty=false}={}) {
    // Assert that a snapshot dir is structurally valid, then write its file listing to .files.json.
    //
    //   snap_dir:  snapshot directory to check (the process also chdir()s into it)
    //   is_empty:  when true, additionally assert the root holds no entries besides
    //              hidden files and the versions folder
    //
    // Throws an assertion error if: the versions folder is missing or is a symlink, loose entries
    // exist while is_empty is set, a root entry is not a symlink, a root symlink is broken,
    // or any version dir fails assertVersionDirIsValid().
    process.chdir(snap_dir)
    console.log()
    console.log(`[☑️] Checking that snapshot records are valid...`)

    // get all visible directory entries in the snapshot root, excluding the versions folder.
    // The predicate must return true for entries to keep: the previous callback only ever returned
    // false/undefined, which emptied this list and silently disabled every check below.
    const snapshot_dir_entries =
        (await fs.promises.readdir(snap_dir, {withFileTypes: true}))
            .filter(dirent => !dirent.name.startsWith('.') && dirent.name != 'versions')

    // assert versions folder exists and is not a symbolic link
    const versions_dir = path.join(snap_dir, 'versions')
    assert(fs.existsSync(versions_dir), `Snapshot dir is missing its versions folder! ${versions_dir}`)
    assert(!(await fs.promises.lstat(versions_dir)).isSymbolicLink(), `Snapshot versions folder must not be a symbolic link! ${versions_dir}`)

    // if it should be empty, check that no loose files exist
    if (is_empty) {
        assert(!snapshot_dir_entries.length, `Found loose files in snapshot-dir that shouldn't be there! ${snap_dir}`)
    }

    // assert all visible entries in the snapshot root are symbolic links to data that still exists
    for (const snapshot_dir_entry of snapshot_dir_entries) {
        assert(snapshot_dir_entry.isSymbolicLink(), `Found non-symbolic link in root of snapshot dir! ${snap_dir}/${snapshot_dir_entry.name}`)
        // existsSync follows the link, so a dangling symlink reports false.
        // Resolve against snap_dir explicitly instead of depending on the current working directory.
        assert(fs.existsSync(path.join(snap_dir, snapshot_dir_entry.name)), `Found broken symbolic link in root of snapshot dir! ${snap_dir}/${snapshot_dir_entry.name}`)
    }

    const version_entries = (
        (await fs.promises.readdir(versions_dir))
            .filter(foldername => !foldername.startsWith('.'))
            .sort())

    console.log(` √ ${prettyPath(versions_dir)}`, version_entries.length)

    // validate version dirs one at a time (each validation also writes that dir's .files.json)
    for (const version_dir of version_entries) {
        await assertVersionDirIsValid(path.join(versions_dir, version_dir))
    }

    // write snapshot dir file listing w/ sizes & hashes to .files.json
    const directory_info = await getDirInfo(snap_dir, {withRoot: true, withHelpers: false, maxdepth: 3})
    await overwriteFile(path.join(snap_dir, '.files.json'), directory_info)
}
async function setupSnapshotDB({ original_url, start_time, snapshot_dir }) {
    // Look up the Snapshot database row for this URL and return it (or the falsy lookup result if none exists).
    //
    // Lookup order:
    //   1. a row matching both the url and the timestamp (= last path segment of snapshot_dir)
    //   2. any row matching just the url
    // Creating a missing row is currently disabled: a message is logged but the DB is not modified,
    // so callers must be prepared to receive an empty result.

    const timestamp = snapshot_dir.split('/').at(-1)

    const exact_match = await Snapshot.findOne({ where: { url: original_url, timestamp } })
    if (exact_match) return exact_match

    const url_match = await Snapshot.findOne({ where: {url: original_url} })
    if (!url_match) {
        console.log(`[+] Creating new DB Snapshot [${timestamp}](${original_url.substring(0, 30)}...) for ${prettyPath(snapshot_dir)}...`)
    }
    // a url-only match with a different timestamp than the directory is tolerated on purpose
    return url_match
}
async function setupModalAutoClosing(page, page_state, {timeout=1_250}={}) {
    // Automatically accept any JS dialog (alert/confirm/prompt/beforeunload) that the page opens.
    //
    //   page:        page-like event emitter exposing on()/off() and emitting 'dialog' / 'close'
    //   page_state:  unused here, accepted for a uniform setup-function signature
    //   timeout:     ms to leave the dialog visible before accepting it
    //
    // The listener is removed again when the page closes.

    const acceptDialogAfterDelay = (dialog) => {
        console.log(`[👆] Auto-closing modal that popped up: ${dialog.message()}...`)
        setTimeout(() => {
            // accept() is asynchronous: a plain try/catch around the call cannot catch its rejection
            // (e.g. dialog already handled, page already closed), so swallow it on the promise instead.
            Promise.resolve()
                .then(() => dialog.accept())
                .catch(() => {})
        }, timeout)
    }
    page.on('dialog', acceptDialogAfterDelay)

    // if you expect a file-upload dialog, use this to catch it instead:
    // const [fileChooser] = await Promise.all([
    //     page.waitForFileChooser(),
    // ]);
    // await fileChooser.accept(['/tmp/myfile.pdf']);

    page.on('close', () => {
        try {
            // pass the exact handler: off() without one is a no-op or throws depending on the emitter
            page.off('dialog', acceptDialogAfterDelay)
        } catch(err) {}
    })
}
() => {await saveScreenrecording(page, page_state)});
    return page_state
}

/**
 * Save the raw bytes of every matching network response of `page` to disk.
 *
 * Layout written under RESPONSES_PATH(page):
 *   ./all/<timestamp>__<METHOD>__<encoded-url>.<ext>   the response bytes
 *   ./<resourceType>/<hostname>/<path>/...             symlink to the file in ./all
 *   ./index.jsonl                                      one JSON line per response
 *
 * Responses with a 3xx/4xx status and resource types outside `types_to_save`
 * are not written. The handler never throws: every failure is logged instead,
 * because nothing above a puppeteer event handler can catch it.
 *
 * @param page        puppeteer page to listen on
 * @param page_state  gets `main_response` set to the first response whose
 *                    request url equals `page_state.original_url`
 */
async function startResponseSaving(page, page_state) {
    const dir = RESPONSES_PATH(page)
    await fs.promises.mkdir(dir, {recursive: true})

    console.log(`[🌄] Starting raw response bytes recording...`.padEnd(82), prettyPath(dir) + '/')

    // Document, Stylesheet, Image, Media, Font, Script, TextTrack, XHR, Fetch, Prefetch, EventSource, WebSocket, Manifest, SignedExchange, Ping, CSPViolationReport, Preflight, Other
    const types_to_save = [
        // 'document',
        'script',
        'stylesheet',
        'font',
        'image',
        'media',
        'xhr',
        'websocket',
    ]

    // reset responses index file to empty
    const responses_log_path = path.join(dir, 'index.jsonl')
    await overwriteFile(responses_log_path, '')

    // add handler to save all matching responses into the output directory
    page.on('response', async (response) => {
        try {
            const timestamp = versionStrFromDate(new Date(), {withDate: true, withTime: true, withSeconds: true, withMilliseconds: true})

            if (!page_state.main_response && (response.request().url() == page_state.original_url)) {
                // save first response as main page response (if we haven't already caught it earlier)
                page_state.main_response = response
            }

            const status = response.status()
            if ((status >= 300) && (status < 500)) {
                // console.log('Got bad response from', response.url(), 'to', response.headers()['location'])
                return
            }
            const request = response.request()
            const resourceType = request.resourceType()
            // console.log('    ', resourceType, response.url())
            if (!types_to_save.includes(resourceType)) return

            const url_scheme = (response.url() || request.url()).split(':')[0].toLowerCase()
            const method = (url_scheme === 'data') ? 'DATA' : request.method()

            // create ./responses/xhr/www.facebook.com/static/images/icons/ subdir based on hostname + path
            const resource_type_dir = path.join(dir, resourceType)
            const url = new URL(response.url())
            const truncated_pathname = (url.pathname || '').slice(0, 250)
            const url_path = truncated_pathname.endsWith('/')
                ? truncated_pathname
                : path.dirname(truncated_pathname)

            // determine subdirectory based on url type (handles http:,https:,file:,data:,chrome-extension:,about:,etc.)
            let subdir = resource_type_dir
            if (!URL_SCHEMES_IGNORED.includes(url_scheme)) {
                // is a normal http:// or https:// url, use the domain + path to construct subdirectory
                subdir = path.join(resource_type_dir, (url.hostname || 'data').slice(0, 250), url_path)
            } else if (url_scheme == 'data') {
                // is a data:... url, store in ./data subdirectory
                subdir = path.join(resource_type_dir, 'data')
            } else {
                // is a chrome-extension:// or other special url, use the extension id + path to construct subdirectory
                const special_url_path = path.dirname((url.pathname || '').slice(0, 999))
                subdir = path.join(resource_type_dir, url_scheme, (url.hostname || 'data').slice(0, 250), special_url_path)
            }

            // write response to responses/all/1716861056899__https%3A%2F%2Fwww.instagram.com%2Fgraphql%2Fquery.json
            let resp_mimetype = null
            let extension = ''
            let uniq_filename = null
            let uniq_abspath = null
            let symlink_abspath = null
            let responseSha256 = null
            try {
                await fs.promises.mkdir(path.join(dir, 'all'), {recursive: true})
                try {
                    await fs.promises.mkdir(subdir, {recursive: true})
                } catch(err) {
                    // a regular file may already occupy this path (e.g. /foo saved before /foo/bar)
                    subdir = subdir + '.dir'   // TODO: apply this workaround to parent path entries too
                    try {
                        await fs.promises.mkdir(subdir, {recursive: true})
                    } catch(err) {
                        subdir = path.join(resource_type_dir, 'data')
                        await fs.promises.mkdir(subdir, {recursive: true})
                    }
                }
                ;({abspath: symlink_abspath, resp_mimetype, extension} = await detectFilename({page, response, dir: subdir, resourceType}))
                extension = extension || ''

                // responses/all/1716861056899__https%3A%2F%2Fwww.instagram.com%2Fgraphql%2Fquery.json
                // Strip a trailing ".<ext>" with a literal suffix check. The previous unescaped
                // RegExp(`.${extension}$`) became /.$/ for an empty extension and chopped the
                // last character off every extension-less filename.
                const encoded_url = encodeURIComponent(url.href).slice(0, 64).replaceAll('/', '_')
                const ext_suffix = extension ? `.${extension}` : ''
                const basename = (ext_suffix && encoded_url.endsWith(ext_suffix))
                    ? encoded_url.slice(0, -ext_suffix.length)
                    : encoded_url
                uniq_filename = `${timestamp}__${method}__` + [basename, extension].filter(Boolean).join('.')
                uniq_abspath = path.join(dir, 'all', uniq_filename)

                let bytesBuffer = null
                try {
                    bytesBuffer = await response.buffer()
                } catch(err) {
                    if (String(err).includes("Cannot read properties of undefined (reading 'body')")) {
                        // not sure why it's happening but seems to be too late to capture body sometimes? possible race condition
                    } else {
                        console.warn('[âš ī¸] Failed to save response bytes for:', response.request().url(), err)
                    }
                }
                if (bytesBuffer) {
                    // write response data into ./all/<timestamp>__<METHOD>__<url>.<ext>
                    await overwriteFile(uniq_abspath, bytesBuffer)

                    responseSha256 = crypto.createHash('sha256').update(bytesBuffer).digest('hex')

                    // write symlink ./<resourceType>/<hostname>/<path>/<filename> -> the file in ./all
                    await overwriteSymlink(uniq_abspath, symlink_abspath, {relative: dir, mkdirs: true, search_limit: dir})
                }
                // console.log('    ->', symlink_abspath)
            } catch(err) {
                // dont do anything for redirectresponses, error responses, etc.
                console.warn(err)
            }

            const urlSha256 = crypto.createHash('sha256').update(String(request.url())).digest('hex')
            // const headersSha256 = crypto.createHash('sha256').update(String(request.headers()))   // someday we may want to save headers hashes too

            const truncated_url = (method == 'DATA') ? request.url().slice(0, 128) : request.url()  // don't duplicate bytes in data: urls (we already saved them in the file)

            // this is essentially replicating the functionality of a WARC file, but in directory + index.jsonl form
            await fs.promises.appendFile(
                responses_log_path,
                JSON.stringify({
                    ts: timestamp,
                    method,
                    url: truncated_url,
                    urlSha256,
                    postData: request.postData(),
                    response_url: ((method != 'DATA') && (url.href != request.url())) ? url.href : undefined,
                    status,
                    resourceType,
                    mimeType: resp_mimetype,
                    responseSha256,
                    path: uniq_abspath?.replace(dir, '.'),
                    symlink_path: symlink_abspath?.replace(dir, '.'),
                    extension,
                }) + '\n',
                'utf-8',
            )
        } catch(err) {
            // we should never throw hard errors here because there's nothing above us to catch it
            // and we dont want to crash the entire CDP session / browser / main node process
            console.warn('[❌] Error in response handler (set in startResponseSaving):', err)
        }
    });
    // handled by stopMetadataRecording():
    // page.on('close', () => {
    //     page.off('response')
    // })
}

/**
 * Merge cookies that share (domain, path, name) and relax them for archiving.
 *
 * Later entries override earlier ones with the same key. Every kept cookie gets
 * the maximum 32-bit expiry and session/secure/httpOnly forced to false
 * (`__`-prefixed cookies are re-marked secure, see the link below). A leading
 * '.' is stripped from the domain, and only attributes listed in
 * `allowed_cookie_attrs` survive. A cookie with an empty value removes any
 * previously seen cookie with the same key.
 *
 * More attrs exist (size, sameParty, sourcePort, ...): https://pptr.dev/api/puppeteer.cookieparam
 *
 * @param cookies  cookie objects in puppeteer shape
 * @returns        deduplicated cookies
 * @throws         re-throws any error hit while processing a cookie (e.g. missing name)
 */
function dedupeCookies(cookies) {
    const allowed_cookie_attrs = ['domain', 'path', 'name', 'value', 'expires', 'sameSite', 'sourceScheme', 'url', 'priority', 'secure', 'httpOnly']

    const deduped_cookies = {}
    for (const cookie of cookies) {
        try {
            // Normalize the domain BEFORE building the key, so ".example.com" and "example.com"
            // collapse into one entry instead of producing two cookies with the same final domain.
            // Cookies identified only by `url` have no domain and are passed through as-is.
            const domain = (typeof cookie.domain === 'string' && cookie.domain.startsWith('.'))
                ? cookie.domain.slice(1)
                : cookie.domain
            const unique_id = `${domain}${cookie.path}${cookie.name}`

            const merged = {
                ...(deduped_cookies[unique_id] || {}),
                ...cookie,
                domain,
                expires: 2147483640,         // max allowed expiry time (2038-01-18)
                session: false,              // make sure cookies dont expire at browser close time
                secure: false,               // make cookie restrictions more lax (for archiving scripts)
                httpOnly: false,             // make it easier to tamper with cookies from JS (for archiving scripts)
            }

            if (!merged.value) {
                delete deduped_cookies[unique_id]
                continue
            }
            if (merged.name.startsWith('__')) {
                // cookies that start with __ must be secure, see https://github.com/puppeteer/puppeteer/issues/6806
                merged.secure = true
                merged.sourceScheme = 'Secure'
            }

            // keep only the attributes puppeteer accepts when setting cookies
            deduped_cookies[unique_id] = Object.fromEntries(
                Object.entries(merged).filter(([key]) => allowed_cookie_attrs.includes(key))
            ) as Cookie
        } catch(err) {
            console.error('[❌] Failed to parse cookie during deduping', cookie)
            throw err
        }
    }
    // console.log(`[đŸĒ] Deduped ${cookies.length} cookies to ${Object.keys(deduped_cookies).length}...`)

    return Object.values(deduped_cookies) as Cookie[]
}

// Read cookies.txt into puppeteer-shaped cookies.
// NOTE: currently disabled, the early return below always yields an empty list.
async function loadCookiesTxt() {
    const cookies = [] as Cookie[]
    return cookies  // write-only from chrome -> files for now

    if (fs.existsSync(COOKIES_TXT_PATH)) {
        // console.log(`[đŸĒ] Loading cookies/localStorage/sessionStorage from ${COOKIES_TXT_PATH}...`)

        // Read from to cookies.txt file using tough-cookie + @root/file-cookie-store
        const cookies_store = new FileCookieStore(COOKIES_TXT_PATH, {auto_sync: false, lockfile: false});
        cookies_store.getAllCookiesAsync = util.promisify(cookies_store.getAllCookies);
        const exported_cookies = await cookies_store.getAllCookiesAsync()
        for (const cookie of exported_cookies) {
            const cookie_from_tough = cookie.toJSON()
            const domain = cookie_from_tough.hostOnly ?
`.${cookie_from_tough.domain}` : cookie_from_tough.domain
            const cookie_for_puppeteer: Cookie = {
                domain,
                name: cookie_from_tough.key,
                path: cookie_from_tough.path,
                value: cookie_from_tough.value,
                secure: cookie_from_tough.secure || false,
                httpOnly: cookie_from_tough.httpOnly || false,
                session: false,
                expires: (new Date(cookie_from_tough.expires)).valueOf()/1000,
                size: undefined,
            }
            // console.log('COOKIE_FROM_TOUGH_TXT', cookie_from_tough, cookie_for_puppeteer)
            cookies.push(cookie_for_puppeteer)
        }
    }
}

type AuthJSON = {
    cookies: Cookie[],
    sessionStorage: any,
    localStorage: any,
}

/**
 * Load saved auth state (cookies.txt + auth.json) and optionally restore the
 * saved sessionStorage/localStorage into `page`.
 *
 * Storage is keyed by origin. It is applied to the current document and, via
 * evaluateOnNewDocument, to every document loaded afterwards. Cookies are
 * deduplicated and returned but are NOT set on the page here (that code path
 * is currently disabled, see the note in the body). A file that fails to parse
 * is renamed to `<name>.corrupted`.
 *
 * @param page    puppeteer page to restore storage into
 * @param client  unused here, kept for signature compatibility
 * @param apply   when false, only load and return the data
 * @returns       {cookies, sessionStorage, localStorage}
 */
async function loadAuthStorage(page, {client}, {apply=true}={}) {
    // named saved_* so they cannot be confused with the browser globals used inside page.evaluate()
    let cookies: Cookie[] = []
    let saved_session_storage: any = {}
    let saved_local_storage: any = {}

    if (!LOAD_AUTH_STORAGE) {
        // dont read auth from filesystem auth.json/cookies.txt, just rely on existing cookies in chrome profile
        return {cookies, sessionStorage: saved_session_storage, localStorage: saved_local_storage}
    }

    if (fs.existsSync(COOKIES_TXT_PATH)) {
        try {
            cookies = (await loadCookiesTxt()) || []
        } catch(err) {
            console.warn('[âš ī¸] Loaded invalid cookies.txt, moved it to cookies.txt.corrupted (did two processes try to change it at the same time?)')
            await fs.promises.rename(COOKIES_TXT_PATH, COOKIES_TXT_PATH + '.corrupted')
        }
        // console.log(`[đŸĒ] Loading cookies from cookies.txt...`, cookies.length)
    }

    if (fs.existsSync(AUTH_JSON_PATH)) {
        try {
            const auth_json: Partial<AuthJSON> = JSON.parse(await fs.promises.readFile(AUTH_JSON_PATH, 'utf-8')) || {}
            // Every key is optional. Without these defaults a valid auth.json lacking `cookies`
            // was renamed to .corrupted, and one lacking a storage key crashed below on
            // `undefined[origin]`.
            cookies = [...cookies, ...(auth_json.cookies || [])]
            saved_session_storage = auth_json.sessionStorage || {}
            saved_local_storage = auth_json.localStorage || {}
            // console.log(`[đŸĒ] Loading cookies from auth.json...`, (auth_json.cookies || []).length)
        } catch(err) {
            console.warn('[âš ī¸] Loaded invalid auth.json, moved it to auth.json.corrupted (did two processes try to change it at the same time?)')
            await fs.promises.rename(AUTH_JSON_PATH, AUTH_JSON_PATH + '.corrupted')
        }
    }

    cookies = dedupeCookies(cookies)

    if (apply) {
        console.log(`[đŸĒ] Loading stored cookies/localStorage/sessionStorage into session...`, cookies.length)

        // NOTE: applying the cookies themselves is currently disabled. The previous approach was:
        //   try { await page.setCookie(...cookies) }                          // all at once (much faster)
        //   catch(err) { for (const cookie of cookies) await page.setCookie(cookie) }   // one-by-one to find the bad one
        const origin = await page.evaluate(() => window.location.origin)

        // inside these callbacks sessionStorage/localStorage are the BROWSER globals
        await page.evaluate((savedSessionStorage) => {
            for (const [key, value] of Object.entries(savedSessionStorage)) {
                sessionStorage[key] = value;
            }
        }, saved_session_storage[origin] || {});

        await page.evaluate((savedLocalStorage) => {
            for (const [key, value] of Object.entries(savedLocalStorage)) {
                localStorage[key] = value;
            }
        }, saved_local_storage[origin] || {});

        // origin/auth context changes when we do page.goto so we have to hook pageload and apply it then as well
        // https://stackoverflow.com/questions/51789038/set-localstorage-items-before-page-loads-in-puppeteer
        await page.evaluateOnNewDocument(({sessionStorage, localStorage}) => {
            const origin = window.location.origin;

            for (const [key, value] of Object.entries(sessionStorage[origin] || {})) {
                window.sessionStorage.setItem(key, value as string)
            }
            for (const [key, value] of Object.entries(localStorage[origin] || {})) {
                window.localStorage.setItem(key, value as string)
            }

        }, {sessionStorage: saved_session_storage, localStorage: saved_local_storage});
    }

    return {cookies, sessionStorage: saved_session_storage, localStorage: saved_local_storage}
}

/**
 * Ask a FlareSolverr server for cookies that pass Cloudflare's bot check for
 * `original_url`, and set them on `page`.
 *
 *   docker run -p 8191:8191 -e LOG_LEVEL=info ghcr.io/flaresolverr/flaresolverr
 *
 * Never throws: on any failure (server unreachable, timeout, no challenge
 * detected) it logs and returns [].
 *
 * @param page          puppeteer page to set the cookies on
 * @param original_url  url to solve the challenge for
 * @param timeout       ms, sent to FlareSolverr as `maxTimeout`; the HTTP request
 *                      itself is aborted 10s after that
 * @returns             the cookies that were set, or []
 */
async function loadCloudflareCookie(page, {original_url}, {timeout=20_000}={}) {
    // alternatives if this stops working:
    // - https://github.com/omkarcloud/botasaurus
    // - https://github.com/ultrafunkamsterdam/nodriver
    // - https://github.com/Akmal-CloudFreed/CloudFreed-CloudFlare-bypass
    // - https://github.com/VeNoMouS/cloudscraper

    const query = { url: original_url, cmd: "request.get", maxTimeout: timeout }
    try {
        const response = await fetch(FLARESOLVERR_API_ENDPOINT, {
            method: 'POST',
            headers: {'Content-Type': 'application/json'},
            body: JSON.stringify(query),
            // fetch() has no default timeout: without this an unresponsive server blocks the archive job forever
            signal: AbortSignal.timeout(timeout + 10_000),
        });
        const data = await response.json();

        const new_cookies = (data?.solution?.cookies || []).map(cookie => ({
            ...cookie,
            'expires': 2147483640,           // overwrite expiration to 32bit maximum timestamp (2038-01-18)
            'secure': false,                 // don't restrict the cookie to HTTPS-only requests
        }))

        if (new_cookies.length) {
            console.log(`[â˜‘ī¸] Got Cloudflare bypass cookies (${new_cookies.length}) from FlareSolverr API...`)
            await page.setCookie(...new_cookies);
            return new_cookies
        } else {
            const error_str = JSON.stringify(data?.message || data, null, 4)
            throw new Error(`Bad FlareSolverr Response: ${error_str}`)
        }

    } catch (error) {
        if (String(error?.message || error).includes('Challenge not detected')) {
            console.log('[â˜‘ī¸] Page is accessible without FlareSolverr Cloudflare bypass.')
        } else {
            console.warn('[❌] Failed to get Cloudflare bypass cookies from FlareSolverr API.', error)
        }
    }
    return []
}

// Intercept every request of the page and apply URL_REWRITES (ordered by .idx) to its url.
async function setupURLRewriting(page, page_state) {
    await page.setRequestInterception(true);

    const rewrites = URL_REWRITES.sort((a, b) => (a.idx || 0) - (b.idx || 0))

    page.on('request', interceptedRequest => {
        if (interceptedRequest.isInterceptResolutionHandled()) return;

        const original_url = interceptedRequest.url()

        // apply all the rewrites in order to the request URL
        let url = original_url
        for (const rewrite of rewrites) {
            const new_url = url.replace(rewrite.pattern, rewrite.replacement)
            // console.log(rewrite, url,
//     new_url)   // (tail of the commented-out console.log debug line that starts on the previous line)

            // if url is rewritten to an emptystring, abort the request
            if (!new_url) {
                console.warn('[đŸŸĨ] Request blocked', rewrite.pattern, ':', url)
                interceptedRequest.abort()
                return
            }
            else if (new_url && new_url != url) {
                // console.warn('[đŸ“ŗ] Request rewritten', rewrite.pattern, rewrite.replacement, ':', url, '->', new_url)
                console.warn('[đŸ“ŗ] Request rewritten', rewrite.pattern, ':', new_url)
                url = new_url
            }
        }

        if (url == original_url) {
            // if url is unchanged, continue request flow as-is
            interceptedRequest.continue()
        } else {
            // otherwise redirect the browser to our rewritten version
            interceptedRequest.respond({
                status: 302,
                headers: {
                    location: url,
                    'x-redirect-by': 'ArchiveBox.setupURLRewriting',
                },
            })
        }
    });
    // handled by stopMetadataRecording():
    // page.on('close', () => {
    //     page.off('request')
    //     page.setRequestInterception(false)
    // })
}

/**
 * Start recording redirects, CDP network traffic and console output for `page`.
 *
 * - `redirects` gets an initial entry for the starting url plus one entry per
 *   new main-frame navigation or HTTP redirect of type Document
 * - `traffic_log` collects the raw CDP Network.* events keyed by requestId
 * - `console_log` and CONSOLELOG_PATH(page) collect console messages, page
 *   errors and failed requests, one line each
 *
 * Also enables the CDP Network domain and allows downloads into CHROME_DOWNLOADS_DIR.
 * The puppeteer listeners are removed again by stopMetadataRecording().
 *
 * @param page          puppeteer page
 * @param original_url  starting url; falls back to page.url() when empty
 * @param version       written into the console log header line
 * @param client        CDP session of the page
 * @param traffic_log   object mutated in place
 * @param console_log   array mutated in place
 * @param redirects     object mutated in place
 * @returns             {original_url, client, redirects, traffic_log, console_log}
 */
async function startMetadataRecording(page, {original_url, version, client, traffic_log, console_log, redirects}) {
    // Normalize once and use the same value everywhere. Previously only page._original_url
    // got the fallback, so a missing original_url produced a redirects['undefined'] entry.
    original_url = (original_url || (await page.url())).toString()

    // update helper state on page
    page._original_url = original_url

    // DEBUGGING: helpers for repl() debugging, dont rely on these (global state is badd mmkay)
    // page._client = client || page._client || await page.target().createCDPSession()
    // page._redirects = redirects
    // page._traffic_log = traffic_log

    // add initial entry to page redirect log
    redirects[original_url] = {
        idx: 0,
        url: original_url,
        src: null,
        type: 'Initial',
        wallTime: Date.now()/1000,
        frameId: page.mainFrame()._id,
        requestId: null,
        initiator: {type: "user"},
        isMainFrame: true,
    }

    // DEBUGGING: record optional chrome debug trace with screenshots (heavy)
    // try {
    //     await page.tracing.stop()
    //     await wait(200)
    // } catch(err) {}
    // try {
    //     await page.tracing.start({path: TRACE_PATH(page), screenshots: true});
    // } catch(err) {}

    let last_main_frame_url = original_url

    // setup network request intercepts handler
    const addCDPRequestDataListener = (eventName) => {
        client.on(eventName, event => {
            try {
                // save any HTTP/JS redirects to redirects for saveRedirects(page) to use later on
                const new_url = event.documentURL
                const http_status = event.redirectResponse?.status || 0
                const is_new_url = (new_url !== original_url) && !redirects[new_url]
                const is_main_frame_navigation = (event.frameId == page.mainFrame()._id)
                const is_http_redirect = (300 <= http_status) && (http_status < 400)   // whole 3xx range, 300 included

                if (new_url && is_new_url && (is_main_frame_navigation || is_http_redirect) && event.type == 'Document') {
                    const new_redirect_entry = {
                        url: new_url,
                        src: event.redirectResponse?.url || last_main_frame_url,
                        type: http_status || 'JS',
                        wallTime: Date.now()/1000,
                        frameId: event.frameId,
                        requestId: event.requestId,
                        initiator: event.initiator,
                        idx: Object.keys(redirects).length,
                        isMainFrame: is_main_frame_navigation,
                    }
                    redirects[new_url] = new_redirect_entry
                    if (is_main_frame_navigation) {
                        ALREADY_ARCHIVED.add(new_redirect_entry.url.slice(0, 4096))  // we're already archiving this tab as it redirects, dont create a duplicate archive for the destination
                        console.warn(`[âžĄī¸] NAVIGATION[${new_redirect_entry.type}]${ANSI.blue} ${last_main_frame_url} ${ANSI.reset}\n ->${ANSI.blue} ${new_redirect_entry.url} ${ANSI.reset}`)
                        last_main_frame_url = new_url
                    }
                }

                if (event.loaderId) {
                    traffic_log[event.loaderId] = traffic_log[event.loaderId] || {}       // make sure loader is also in requests list first
                    // sometimes it's not in the list if we start archiving too late / after a page's initial request was already made
                }

                // save to traffic_log as {8BC2087A2CCEF28017099C0E10E87440: {Network.eventWillBeSent: {eventId,loaderId, request|response, ...}}
                // https://stackoverflow.com/questions/47078655/missing-request-headers-in-puppeteer?noredirect=1&lq=1
                traffic_log[event.requestId] = traffic_log[event.requestId] || {}
                Object.assign(traffic_log[event.requestId], { [eventName]: event })

                // DEBUGGING: log page visits and navigation events to console
                // if (event?.response?.status) {
                //     // if we're expecting an HTML response, then we assume it's a page visit & log it to console
                //     const acceptMimeType = traffic_log[event.requestId]['Network.requestWillBeSentExtraInfo']?.headers?.accept
                //     if (acceptMimeType && acceptMimeType.includes('text/html')) {
                //         // log any HTML page responses (less noisy)
                //         console.log(`[>] GOT ${event.documentURL}: ${event.response.status} ${event.response.url} (${event.response.mimeType})`)
                //     } else {
                //         // log ALL responses, inclusing JS,CSS,Images,etc. (very noisy)
                //         // console.log(` > ${event.response.status} ${event.response.url} (${event.response.mimeType})`)
                //     }
                // }
            } catch(err) {
                console.warn('[X] Error during request/response handler (startMetadataRecording.addCDPRequestDataListener)')
                console.warn(err)
            }
        })
    }
    addCDPRequestDataListener('Network.requestWillBeSent')
    addCDPRequestDataListener('Network.requestWillBeSentExtraInfo')
    addCDPRequestDataListener('Network.responseReceived')
    addCDPRequestDataListener('Network.responseReceivedExtraInfo')

    // clear any existing log entries
    const consolelog_info = {
        TYPE: 'console',
        VERSION: version,
        URL: original_url,
    }
    await overwriteFile(CONSOLELOG_PATH(page), JSON.stringify(consolelog_info) + '\n')

    // record console logs from page
    const appendConsoleLog = async (line) => {
        if (!line) return
        console_log.push(line)
        try {
            await fs.promises.appendFile(
                CONSOLELOG_PATH(page),
                line + '\n',
                'utf-8',
            )
        } catch(err) {
            // runs inside event handlers: a rejection here would be unhandled and could kill the process
            console.warn('[X] Failed to append to console log file', err)
        }
    }

    // message.location() is an object ({url, lineNumber, columnNumber}); interpolating it
    // directly logged "[object Object]", so format it explicitly
    const formatLocation = (loc) =>
        loc?.url ? `${loc.url}:${loc.lineNumber ?? 0}:${loc.columnNumber ?? 0}` : ''

    page.on('console', async (message) =>
        await appendConsoleLog(`${message.type().toUpperCase()} ${formatLocation(message.location())} ${JSON.stringify(message.text())}`))
    page.on('pageerror', async (error) =>
        await appendConsoleLog(error.message || JSON.stringify(error)))
    page.on('requestfailed', async (request) =>
        await appendConsoleLog(`${request.failure()?.errorText} ${request.url() || JSON.stringify(request)}`))

    // set puppeteer options on page
    await client.send('Network.enable')                         // enable network tampering API
    await client.send('Emulation.clearDeviceMetricsOverride');  // drop any emulated device metrics (viewport size/scale override)
    await client.send('Page.setDownloadBehavior', {
        behavior: 'allow',
        downloadPath: CHROME_DOWNLOADS_DIR,
    })

    // listener cleanup is handled by stopMetadataRecording()

    return {original_url, client, redirects, traffic_log, console_log}
}

/**
 * Remove the puppeteer listeners installed by the recording/interception
 * helpers and switch request interception off again.
 *
 * Never throws: errors (e.g. the page is already closed) are logged.
 */
async function stopMetadataRecording(page, _page_state) {
    console.log('[đŸĒ] Stopping CDP event hooks and request interception...')
    try {
        for (const event_name of ['request', 'response', 'console', 'pageerror', 'requestfailed', 'hashchange']) {
            page.off(event_name)
        }
        // must be awaited: otherwise a rejection (page already closed) bypasses this try/catch
        await page.setRequestInterception(false)
        // page.tracing.stop()
    } catch(err) {
        // some versions of puppeteer have had race conditions here where page is already closed by now
        console.warn('[X] Error in page close handler', err)
    }
}

/********************** Human Behavior Emulation ******************************/

// Click the 2captcha-solver extension widget (if one appears) and wait up to `timeout` ms for it to report "solved".
async function solveCaptchas(page, page_state, {timeout=90_000}={}) {

    // using puppeteer-extra-plugin-recaptcha auto-solver
    // await page.solveRecaptchas()

    // using 2captcha-solver extension auto-solver
    try {
        // console.log('[🕑] Waiting for CAPTCHA to appear...')
        await page.waitForSelector('.captcha-solver', {timeout: 5_000})

        console.log('[🤖] CAPTCHA challenge found, submitting to 2Captcha for solving...')
        await page.click('.captcha-solver')

console.log(`[🧠] Waiting up to ${timeout/1000}s for CAPTCHA to be solved...`)
        await page.waitForSelector(`.captcha-solver[data-state="solved"]`, {timeout})

        console.log('[🔓] CAPTCHA solution retrieved from 2captcha.')
    } catch(err) {
        console.log('[â˜‘ī¸] No CATPCHA challenges found, site thinks we are human.')
    }
}

/**
 * Move the mouse around randomly for roughly `timeout` ms, passing through
 * the center of DEFAULT_VIEWPORT halfway.
 */
async function jiggleMouse(page, page_state, {timeout=600}={}) {
    console.log(`[🐁] Moving mouse around randomly for ${timeout/1000}s...`)

    const half_duration = timeout / 2
    const viewport_center = {x: DEFAULT_VIEWPORT.width/2, y: DEFAULT_VIEWPORT.height/2}

    const start_point = await getRandomPagePoint(page)
    const cursor = createCursor(page, start_point, true)

    cursor.toggleRandomMove(true)
    await wait(half_duration);
    await cursor.moveTo(viewport_center);
    await wait(half_duration);
    cursor.toggleRandomMove(false)
}

/**
 * Abort every main-frame navigation request whose url differs from `original_url`.
 * All other requests continue untouched. Enables request interception on the page.
 */
async function blockRedirects(page, {original_url}) {
    page.on('request', req => {
        // another interception handler already resolved this request
        if (req.isInterceptResolutionHandled()) return;

        const is_navigation_away = (
            req.isNavigationRequest()
            && req.frame() === page.mainFrame()
            && req.url() !== original_url
        )
        if (!is_navigation_away) {
            req.continue();
            return
        }

        // top-level navigation event to a new url
        req.abort('aborted');
        console.warn('[đŸŸĨ] Blocked page attempt to naviage to new URL', req.url())
    });
    // listener removal is handled by stopMetadataRecording()
    await page.setRequestInterception(true);
}

/**
 * Halt script execution in the page by evaluating a `debugger;` statement.
 * NOTE(review): this presumably only pauses while a debugger/CDP client is
 * attached with breakpoints active; confirm against how the browser is launched.
 */
async function blockJSExecution(page, _page_state) {
    console.warn('[đŸŸĨ] Stopping all JS execution on page...')
    await page.evaluate(() => {
        debugger;
    })
    // OR alternatively this (more buggy, breaks many sites):
    // const html = await page.content();
    // page.setJavaScriptEnabled(false);
    // await page.setContent(html, { waitUntil: 'networkidle0' }); // 4
}

/**
 * Scroll the page down step by step to trigger lazy-loaded content, then
 * scroll back to the top.
 *
 * @param timeout          upper bound in ms, estimated as scroll_delay * number of scrolls
 * @param scroll_delay     ms to wait after each scroll step
 * @param scroll_distance  px to advance per step
 * @param scroll_limit     maximum number of steps
 * @returns                the last measured document.body.scrollHeight
 */
async function scrollDown(page, _page_state, {timeout=120_000, scroll_delay=SCROLL_DELAY, scroll_distance=SCROLL_DISTANCE, scroll_limit=SCROLL_LIMIT}={}) {
    const starting_height = await page.evaluate('document.body.scrollHeight');
    let last_height = starting_height

    let scroll_count = 0;
    let scroll_position = scroll_count * scroll_distance
    // await page.bringToFront()

    // page._original_url is only set by startMetadataRecording(); don't crash when it wasn't called
    const is_facebook_watch = String(page._original_url || '').startsWith('https://www.facebook.com/watch/?v')

    // scroll to top
    await page.evaluate(() => { window.scrollTo({ top: 0, left: 0, behavior: 'smooth' }); });

    while ((scroll_count < scroll_limit) && ((scroll_delay * scroll_count) < timeout)) {
        // report the real step size instead of a hard-coded "1000px"
        console.log(`[âŦ‡ī¸] Scrolling down ${scroll_count}x ${scroll_distance}px... (${scroll_position}/${last_height})`)
        await page.evaluate((y_offset) => { window.scrollTo({ top: y_offset, left: 0, behavior: 'smooth' }); }, scroll_position);
        scroll_count++
        scroll_position = scroll_count * scroll_distance

        // check if any new content was added / if we are infiniscrolling
        let new_height = await page.evaluate('document.body.scrollHeight')
        const added_px = new_height - last_height
        if (added_px > 0) {
            console.log('[✚] Detected infini-scrolling...', `${last_height}+${added_px} => ${new_height}`)
        } else if (scroll_position >= new_height + scroll_distance) {
            // we've reached the bottom, condition isn't true until we've tried to go n+1 past the end (which is fine)
            if (scroll_count > 2)
                break
        }
        last_height = new_height

        // give the smooth scroll and any lazy-loading time to finish before the next step
        await wait(scroll_delay);

        // facebook watch pages infiniscroll (more and more recommendations forever), stop them after 3 pages
        if (is_facebook_watch && scroll_count > 3) break
    }

    // scroll to bottom
    if (scroll_position < last_height) {
        await page.evaluate(() => { window.scrollTo({ top: document.body.scrollHeight, left: 0, behavior: 'smooth' }); });
        await wait(scroll_delay)
        await page.evaluate(() => { window.scrollTo({ top: document.body.scrollHeight, left: 0, behavior: 'smooth' }); });
    }

    // Always wait an additional scroll_delay at the end for scroll animations / loading / rendering to settle down
    console.log('[📉] Reached bottom of the page.', `(${scroll_position}/${last_height})`)
    await wait(scroll_delay);
    await page.evaluate(() => { window.scrollTo({ top: 0, left: 0, behavior: 'smooth' }); });
    await wait(scroll_delay);

    return last_height
}

/**
 * Disable CSS animations/transitions (and hide the text caret) on the current
 * page and on every document the page loads afterwards.
 */
async function disableAnimations(page, _page_state) {
    console.log(`[â›„ī¸] Disabling all animations using CSS override...`)

    // https://stackoverflow.com/questions/53167644/injecting-css-into-site-with-puppeteer
    const css_override = `*, *::before, *::after {
        -moz-animation: none !important;
        -moz-transition: none !important;
        animation: none !important;
        transition: none !important;
        caret-color: transparent !important;
    }`

    // inject override into current page
    await page.addStyleTag({content: css_override});

    // inject override into any subsequently navigated pages
    await page.evaluateOnNewDocument((css_override) => {
        // This script runs before the document is parsed, so <head> usually does not exist yet.
        // The old getElementsByTagName('head')[0].appendChild(...) threw and injected nothing.
        const injectStyle = () => {
            const style_tag = document.createElement('style')
            style_tag.textContent = css_override
            ;(document.head || document.documentElement).appendChild(style_tag)
        }
        if (document.head || document.documentElement) {
            injectStyle()
        } else {
            document.addEventListener('DOMContentLoaded', injectStyle, {once: true})
        }
    }, css_override);
}

async function expandComments(page, _page_state, {timeout=120_000, limit=15_000, delay=650}={}) {
    console.log(`[đŸ—ƒī¸] Expanding up to ${limit} comments every ${delay}ms...`)

    // expand all
    sections in Github READMEs, HedgeDoc pages, etc. + await page.$$eval('pierce/article details', elem => {elem.open = true}) // expand Github README details sections + await page.$$eval('pierce/div.js-discussion details:not(.details-overlay)', elem => {elem.open = true}) // expand Github issue discussion hidden comments + await page.$$eval('pierce/.markdown-body details', elem => {elem.open = true}) // expand HedgeDoc Markdown details sections + + await page.exposeFunction('onHashChange', url => page.emit('hashchange', url)); + await page.evaluateOnNewDocument(() => { + // @ts-ignore + addEventListener('hashchange', (e) => onHashChange(location.href)); + }); + + // Listen for hashchange events in node Puppeteer code. + page.on('hashchange', url => console.log('Page tried to navigate to:', new URL(url))); + + + const num_expanded = await page.evaluate(async ({timeout, limit, delay}) => { + function getElementsByXPath(xpath, ctx?) { + var results = []; + var xpathResult = document.evaluate( + xpath, // e.g. 
//*[text()='"+text+"'] + ctx || document, + null, + XPathResult.ORDERED_NODE_ITERATOR_TYPE, + null + ); + var node; + while ((node = xpathResult.iterateNext()) != null) { + results.push(node); + } + return results; + } + + let num_expanded = 0 + const getLoadMoreLinks = () => [ + // find all the buttons/links to expand collapsed/hidden/lazy-loaded content + ...document.querySelectorAll('faceplate-partial[loading=action]'), // new reddit + ...document.querySelectorAll('a[onclick^="return morechildren"]'), // old reddit show more replies + ...document.querySelectorAll('a[onclick^="return togglecomment"]'), // old reddit show hidden replies + // ...document.querySelectorAll('a.js-show-link'), // stack overflow comments show more (TODO: make this only work on SO) + // ...document.querySelectorAll('a.morelink'), // HackerNews profile show more (TODO: make this only work on HN) + // ...getElementsByXPath("//*[text()~='View \d+ replies']"), // facebook comment expander + ...getElementsByXPath("//*[text()='Show more replies']"), // twitter infiniscroll expander + ...getElementsByXPath("//*[text()='Show replies']"), // twitter replies expander + ] + const wait = (ms) => new Promise(res => setTimeout(res, ms)) + + let load_more_links = getLoadMoreLinks() + while (load_more_links.length) { + console.log('Expanding comments...', load_more_links.length) + for (const link of load_more_links) { + link.scrollIntoView({behavior: 'smooth'}) + if (link.slot == 'children') { + continue + // patch new reddit "More replies" links that would open in a new window to display inline instead + // const comment_id = link.src.split('?')[0].split('/').at(-1) + // link.slot = `children-${comment_id}-0` + // link.__alwaysShowSlot = false + } + // click the "More replies" button + link.click() + num_expanded++ + await wait(delay) + const time_elapsed = num_expanded * delay + if ((num_expanded > limit) || (time_elapsed > timeout)) + return num_expanded + } + load_more_links = getLoadMoreLinks() + } 
+ return num_expanded + }, {timeout, limit, delay}); + + page.off('hashchange') + + if (num_expanded) { + console.log(`[đŸ—ƒī¸] Expanded ${num_expanded} comments...`) + + // scroll to bottom, then back up to top + const final_height = await page.evaluate('document.body.scrollHeight'); + await page.evaluate((top) => { window.scrollTo({ top, left: 0, behavior: 'smooth' }); }, final_height + 1000); + await wait(delay); + await page.evaluate(() => { window.scrollTo({ top: 0, left: 0, behavior: 'smooth' }); }); + await wait(delay); + } + +} + +async function submitForm(page, _page_state, {timeout=5_000}={}) { + try { + await page.waitForSelector('form button[type=submit]', {timeout: 1_500}); + console.log('[â˜‘ī¸] Submitting form...') + await page.click('form button[type=submit]') + await page.waitForNavigation({timeout}); + await page.goBack(); + } catch (err) { + // no form found + } +} + +// TODO: add an evasion to set navigator.connection.rtt = 365 (0 = detectable as headless) + +/******************************************************************************/ +/******************************************************************************/ + +/**************** Extension-Based Archive Output Tasks ************************/ + +async function saveSinglefile(page, {main_response, extensions}) { + const extension = extensions.filter(({name}) => name === 'singlefile')[0] + if (!extension.version) throw 'Could not find Singlefile extension ID, is it installed?' 
+ + const url = await page.url() || main_response.url() + if (URL_SCHEMES_IGNORED.includes(url.split(':')[0])) return null + + // get list of existing past files in downloads/* to ignore + const files_before = new Set( + (await fs.promises.readdir(CHROME_DOWNLOADS_DIR)) + .filter(fn => fn.endsWith('.html')) + ); + + const out_path = SINGLEFILE_PATH(page) + + console.log(`[đŸ› ī¸] Saving Singlefile HTML using extension (${extension.id})...`.padEnd(82+1), prettyPath(CHROME_DOWNLOADS_DIR)) + await page.bringToFront() // action button acts on the foreground tab, so it has to be in front :( + await extension.dispatchAction() + let files_new = [] + + const check_delay = 3_000 + for (const _try in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) { + await wait(check_delay) + + const files_after = (await fs.promises.readdir(CHROME_DOWNLOADS_DIR)).filter(fn => fn.endsWith('.html')); + files_new = files_after.filter(file => !files_before.has(file)) + + if (files_new.length == 0) { + // console.warn(` ...waiting for Singlefile to write HTML into ${CHROME_DOWNLOADS_DIR}...`) + continue + } + // iterate through new downloads and find a matching .html containing our page's URL in the header + for (const file of files_new) { + const dl_path = path.join(CHROME_DOWNLOADS_DIR, file) + const dl_text = await fs.promises.readFile(dl_path, 'utf-8') + const dl_header = dl_text.split('meta charset')[0] + if (dl_header.includes(`url: ${url}`)) { + /// dont need this check anymore as now all output is versioned: + // if (fs.existsSync(out_path)) { + // const {size: existingSize} = await fs.promises.stat(out_path) + // const {size: newFileSize} = await fs.promises.stat(dl_path) + // if (newFileSize < existingSize) { + // console.log(`[đŸ—‘ī¸] Discarding singlefile output (${file}) as it's smaller than existing ${out_path}...`) + // await fs.promises.rm(dl_path) + // return out_path + // } + // } + console.log(`[âœī¸] Moving Singlefile download from ${file}...`.padEnd(82), prettyPath(out_path)) + await 
fs.promises.rename(dl_path, out_path) + return out_path + } + } + } + + console.warn(`[❌] Couldn't find matching Singlefile HTML in ${CHROME_DOWNLOADS_DIR} after waiting ${(check_delay*10)/1000}s:`, files_new.join(', ')) + return null +} + +async function saveArchiveWebPage(page, {extensions}, {timeout=30_000}={}) { + // TODO: waiting on them to expose commands so we can generate .wacz easily + // https://github.com/webrecorder/archiveweb.page/issues/207 + // ... + const browser = await page.browser() + const extension = extensions.filter(({name}) => name === 'archivewebpage')[0] + await page.bringToFront() + await extension.dispatchPopup() + await extension.dispatchAction() + const popup = await browser.waitForTarget( + target => target.url().toString().startsWith(`chrome-extension://${extension.id}/popup.html`), + {timeout: 5_000}, + ) + await page.bringToFront() + + // await puppeteer.Locator.race([ + // popup.locator('::-p-aria(Start With Autopilot)'), + // popup.locator('wr-popup-viewer >>>> input'), + // popup.locator(':scope >>> input') + // ]) + // .setTimeout(timeout) + // .click({ + // offset: { + // x: 7.7265625, + // y: 7.203125, + // }, + // }); + + // @ts-ignore + await puppeteer.Locator.race([ + popup.locator('wr-popup-viewer >>>> div.status-row > p'), + popup.locator(':scope >>> div.status-row > p'), + popup.locator('::-p-text(Recording: \n)') + ]).setTimeout(timeout).click({ + delay: 733.3000000007451, + offset: { + x: 293, + y: 13.5, + }, + }) + + await wait(8_000) + + // @ts-ignore + await puppeteer.Locator.race([ + popup.locator('wr-popup-viewer >>>> div:nth-of-type(2) > button > span:nth-of-type(2)'), + popup.locator(':scope >>> div:nth-of-type(2) > button > span:nth-of-type(2)'), + popup.locator('::-p-text(Stop)') + ]).setTimeout(timeout).click({ + offset: { + x: 7.859375, + y: 23.203125, + }, + }); + + return null +} + +async function savePocket(page, {extensions}) { + const browser = await page.browser() + const extension = 
extensions.filter(({name}) => name === 'pocket')[0] + if (!extension.version) throw 'Could not find Pocket extension ID, is it installed?' + + console.log(`[đŸ› ī¸] Saving URL to Pocket API using extension (${extension.id})...`, 'https://getpocket.com/saves') + await page.bringToFront() // action button acts on the foreground tab, so it has to be in front + await extension.dispatchAction() + try { + const login_window = await browser.waitForTarget( + target => target.url().toString().startsWith('https://getpocket.com/'), + {timeout: 3_000}, + ) + // login window will open if pocket is not signed-in + if (login_window) return false + } catch(e) { + // no new window should open if it saves correctly + return true + } +} + +/***************** Synchronous Archive Output Tasks ***************************/ + +async function saveScreenrecording(page, page_state, {save_gif=true}={}) { + if (page_state.recorder) { + const duration = Date.now() - page_state.start_ts + console.log(`[đŸŽĨ] Saving screen-recording video (${duration/1000}s)...`.padEnd(82), prettyPath(SCREENRECORDING_PATH(page))) + const recorder = page_state.recorder + page_state.recorder = null + await recorder.stop() + + // create symlink for legacy path + const snap_dir = page_state.snapshot_dir + const legacy_path = path.join(snap_dir, 'media', 'screenrecording.mp4') + await overwriteSymlink(SCREENRECORDING_PATH(page), legacy_path, {relative: snap_dir, search_limit: snap_dir}) + + // // remove duplicate frames (white frames at start while it loads + static image at end) + // const video_path = SCREENRECORDING_PATH(page) + // const short_path = video_path.replace('.mp4', '.short.mp4') + // try { + // await exec( + // // create a shortened video starting from 0:02s to 0:01s with duplicate frames removed (can look jumpy sometimes) + // `ffmpeg -ss 2 -sseof -1 -y -i ${video_path} -vf mpdecimate,setpts=N/FRAME_RATE/TB ${short_path}` + // ) + // } catch(err) { + // console.log('[❌] Failed to shorten 
screenrecording.mp4') + // } + + // convert video to GIF + if (save_gif) { + try { + const BIN_NAME = '/Volumes/NVME/Users/squash/bin/ffmpeg' + const child = child_process.spawn( + BIN_NAME, + [ + '-hide_banner', + '-loglevel', 'error', + '-ss', '3', + '-t', '10', + '-y', + '-i', SCREENRECORDING_PATH(page), + '-vf', "fps=10,scale=1024:-1:flags=bicubic,split[s0][s1];[s0]palettegen[p];[s1][p]paletteuse", + '-loop', '0', + SCREENRECORDGIF_PATH(page), + ], + { + cwd: path.dirname(SCREENRECORDING_PATH(page)), + timeout: 60_000, + // stdio: [null, 'pipe', 'pipe'], + stdio: 'ignore', + detached: true, // run in background, don't block on response + }, + ) + await blockUntilExists(SCREENRECORDGIF_PATH(page), {min_bytes: 100, timeout: 40_000}) + console.log(`[đŸŽĨ] Saved screen-recording GIF with ffmpeg pid=${child.pid} (${duration/1000}s)...`.padEnd(82), prettyPath(SCREENRECORDGIF_PATH(page))) + + const snap_dir = page_state.snapshot_dir + const legacy_path = path.join(snap_dir, 'media', 'screenrecording.gif') + await overwriteSymlink(SCREENRECORDGIF_PATH(page), legacy_path, {relative: snap_dir, search_limit: snap_dir}) + } catch(err) { + console.log('[❌] Failed to convert video to GIF:', err) + } + } + + return SCREENRECORDING_PATH(page) + } + return null +} + +async function saveScreenshot(page, _page_state, {aspect_ratio=SCREENSHOT_ASPECT_RATIO, width=null, height=null, jpg_width=1440, jpg_quality=90, timeout=30_000}={}) { + try {await fs.promises.unlink(SCREENSHOT_PATH(page))} catch(err) {} + + // setup width and height + width = width || DEFAULT_VIEWPORT.width + assert((typeof width === 'number') && width > 200) + height = height || Math.floor(width/aspect_ratio) + assert((typeof height === 'number') && height > 200) + + console.log(`[📸] Saving full-page screenshot (${width}x${height}px)...`.padEnd(82), prettyPath(SCREENSHOT_PATH(page))) + + // set width, height, and deviceScale factor: https://github.com/puppeteer/puppeteer/issues/1576 + await page.setViewport({ 
...DEFAULT_VIEWPORT, width, height, deviceScaleFactor: 2}) + await page.bringToFront() + await wait(1_250) // page takes a sec settle after foregrounding and viewport update + + // take lossless fullpage screenshot of 1920x1440+px (4:3+) -> ./screenshot.png + await page.screenshot({ path: SCREENSHOT_PATH(page), fullPage: true, type: 'png' }) + + // wait for the screenshot to be created, then set the viewport to the next size + await blockUntilExists(SCREENSHOT_PATH(page), {min_bytes: 100, timeout}) + await wait(6_000) // puppeteer takes a while to finish writing png data when fullPage: true + + const jpg_height = Math.floor(jpg_width/aspect_ratio) + await page.setViewport({ ...DEFAULT_VIEWPORT, width: jpg_width, height: jpg_height, deviceScaleFactor: 2}) + await wait(1_250) // page takes a sec settle after foregrounding and viewport update + + // WARNING: make sure you never try to create two screenshots at the same time (especially not fullpage screenshots) + // thats why there are all these delays here. 
+ // screenshot creation messes up the whole viewport while it's running, + // and it writes bad/white empty screenshots if you try to make more than one concurrently + + // take compressed screenshot of jpg_width*jpg_height (4:3) -> ./screenshot.jpg + await page.screenshot({ + path: SCREENSHOT_JPG_PATH(page), + type: 'jpeg', + quality: jpg_quality, + clip: { + x: 0, + y: 0, + width: jpg_width, + height: jpg_height, + }, + captureBeyondViewport: false, + }); + await blockUntilExists(SCREENSHOT_JPG_PATH(page), {min_bytes: 100, timeout: timeout/2}) + console.log(`[📸] Saved screenshot as screenshot.jpg (${jpg_width}x${jpg_height}px)...`.padEnd(82), prettyPath(SCREENSHOT_JPG_PATH(page))) + + // reset viewport back to defaults + await wait(1_250) + await page.setViewport(DEFAULT_VIEWPORT) + + // ALTERNATIVE METHOD based on cropping fullpage png and converting to jpg manually: + // import {PNG} from 'pngjs'; + // import jpeg from 'jpeg-js'; + // setTimeout(async () => { + // try { + // const screenshot_png = SCREENSHOT_PATH(page); + // const screenshot_jpg = SCREENSHOT_JPG_PATH(page) + // const jpg_max_height = height + // const jpg_quality = quality; // Adjust the quality as needed (0-100) + + // fs.createReadStream(screenshot_png) + // .pipe(new PNG()) + // .on('parsed', function () { + // const width = this.width; + // const height = this.height; + + // let cropped_height = height; + // if (height > jpg_max_height) { + // cropped_height = jpg_max_height; + // } + + // const cropped_bytes = new Uint8Array(width * cropped_height * 4); + // for (let y = 0; y < cropped_height; y++) { + // for (let x = 0; x < width; x++) { + // const idx = (width * y + x) << 2; + // cropped_bytes[idx] = this.data[idx]; + // cropped_bytes[idx + 1] = this.data[idx + 1]; + // cropped_bytes[idx + 2] = this.data[idx + 2]; + // cropped_bytes[idx + 3] = this.data[idx + 3]; + // } + // } + + // const jpeg_obj = { + // data: cropped_bytes, + // width: width, + // height: cropped_height, + // }; + + 
/**
 * Export the page as a print-style PDF, streaming it to PDF_PATH(page) in chunks.
 *
 * @param page         page object providing url(), bringToFront() and createPDFStream()
 * @param _page_state  unused
 * @param timeout      milliseconds, forwarded to page.createPDFStream()
 * @returns the output path on success, or null when the URL scheme is ignored
 *          or no bytes could be written
 */
async function savePDF(page, _page_state, {timeout=30_000}={}) {
    const url = page.url() || 'about:blank'
    if (URL_SCHEMES_IGNORED.includes(url.split(':')[0])) return null

    const out_path = PDF_PATH(page)
    console.log(`[📓] Saving print-as-PDF export...`.padEnd(82), prettyPath(out_path))
    await page.bringToFront()

    // remove any stale export first, chunks get appended to this path below
    await fs.promises.rm(out_path, {force: true})

    // await page.emulateMediaType('screen')        // print as "@media(screen) instead of @media(print)"

    // page.createPDFStream lets us to save larger PDFs than page.pdf() before crashing
    // (streams to disk in chunks instead of all at once)
    const pdf_stream = await page.createPDFStream({
        timeout: timeout,
        printBackground: true,
        outline: true,
        tagged: true,
        format: 'A4',
        displayHeaderFooter: false,
        // margin: { top: '0.5cm', right: '1cm', bottom: '0.8cm', left: '1cm' },
    })
    const reader = pdf_stream.getReader()

    // iterate through reader and append chunks to out_path
    let num_bytes = 0
    let error = '0 bytes written'
    try {
        while (true) {
            const {done, value} = await reader.read()
            if (done) break;
            await fs.promises.appendFile(out_path, value)
            num_bytes += value.length;
        }
    } catch(err) {
        // keep the real failure reason so it shows up in the warning below
        // (the catch binding must not be named `error`, that would shadow the outer variable)
        error = String(err?.message || err)
        num_bytes = 0
    }

    if (!num_bytes) {
        console.warn('[❌] Failed to save PDF', JSON.stringify(error, null, 4))
        // don't leave a partial/empty PDF behind
        await fs.promises.rm(out_path, {force: true})
        return null
    }

    return out_path
}
+ const replaceShadowDomsWithHtml = (rootElement) => { + if (num_replaced > limit) return + for (const el of rootElement.querySelectorAll('*')) { + if (el.shadowRoot) { + replaceShadowDomsWithHtml(el.shadowRoot); + el.innerHTML += getShadowDomHtml(el.shadowRoot); + } + } + num_replaced++ + }; + + replaceShadowDomsWithHtml(document.body); + + return num_replaced + }, limit) + // console.log(' √ replaced', num_replaced, 'Shadow DOM trees') + } catch(err) { + console.log('[âš ī¸] Inlining Shadow DOM failed', err) + } +} + +async function saveAIQualityAssuranceResult(page, {original_url, version}) { + console.log(`[🧠] Analyzing screenshot with GPT-4o for QA checks...`.padEnd(82), prettyPath(AIQA_PATH(page))) + + let screenshot_path = SCREENSHOT_PATH(page) + const screenshot_cropped_path = SCREENSHOT_JPG_PATH(page) + + if (fs.existsSync(screenshot_cropped_path)) { + // screenshot is too tall to pass to openai, send cropped version instead + screenshot_path = screenshot_cropped_path + } + try { + await blockUntilExists(screenshot_path, {min_bytes: 100, timeout: 7_500}) + } catch (err) { + console.warn('[❌] Failed to send screenshot to GTP-4o for analysis, no screenshot.{png,jpg} exists', err) + return null + } + var stdout = '' + var stderr = '' + let result = null + const PYTHON_BIN = path.join(__dirname, '.venv/bin/python') + const SCRIPT_PATH = path.join(__dirname, 'ai_qa.py') + await blockUntilExists(PYTHON_BIN, {min_bytes: 1, timeout: 250}) + await blockUntilExists(SCRIPT_PATH, {min_bytes: 1, timeout: 250}) + + try { + var {stdout, stderr} = await exec( + `${PYTHON_BIN} ${SCRIPT_PATH} --attach '${screenshot_path}'` + ) + result = JSON.parse(stdout.toString()) + if (!result) throw 'Got empty result!' 
/**
 * Run yt-dlp against original_url inside YTDLP_PATH(page) via saveExecResult().
 *
 * @param page      page whose YTDLP_PATH(page) directory is used as the working dir
 * @param max_size  size limit interpolated into the yt-dlp --format selector
 * @returns a shallow copy of whatever saveExecResult() resolves to
 */
async function saveYTDLP(page, {original_url, version}, {max_size='750m'}={}) {
    const media_dir = YTDLP_PATH(page)
    console.log(`[đŸŽĨ] Saving media with YT-DLP (<=${max_size})...`.padEnd(82), prettyPath(media_dir))

    // yt-dlp runs with this directory as its cwd, so it has to exist first
    await fs.promises.mkdir(media_dir, {recursive: true})

    const format_selector = `--format=(bv*+ba/b)[filesize<=${max_size}][filesize_approx<=?${max_size}]/(bv*+ba/b)`
    const ytdlp_args = [
        '--restrict-filenames',
        '--trim-filenames', '128',
        '--write-description',
        '--write-info-json',
        '--write-annotations',
        '--write-thumbnail',
        '--no-call-home',
        '--write-sub',
        '--write-auto-subs',
        '--convert-subs=srt',
        '--yes-playlist',
        '--continue',
        '--no-abort-on-error',
        '--ignore-errors',
        '--geo-bypass',
        '--add-metadata',
        format_selector,
        '--no-check-certificate',
        '--no-progress',
        // `--cookies=${COOKIES_TXT_PATH}`,      // using logged in cookies actually makes it fail more often, not sure why
        original_url,
    ]
    const exec_options = {
        cwd: media_dir,
        timeout: 300_000,         // 5min timeout
    }

    const {getResult, ...exec_details} = await saveExecResult('yt-dlp', ytdlp_args, {original_url, version}, exec_options)

    return {getResult, ...exec_details}
}
// One place a favicon might be downloaded from; url is null when the candidate is unusable.
type FaviconCandidate = {
    url: string,
    basename: string,
    extension: string,
    expected_mimetype: string,
}

// Candidate: /favicon.ico at the origin of the given URL.
const faviconFromDomain = (url) => {
    // https://auth:pass@t.co:1234/a/bc123 -> https://t.co:1234/favicon.ico
    let url_origin = null
    try {
        url_origin = (new URL(url)).origin
    } catch(err) {
        // url is missing (e.g. main_response was null) or not parseable, skip this candidate
        url_origin = null
    }
    // opaque origins (data:, about:, ...) serialize to the literal string 'null'
    if (url_origin === 'null') url_origin = null

    return {
        url: url_origin ? `${url_origin}/favicon.ico` : null,
        basename: 'favicon',
        extension: undefined,                // auto-detect extension at download time in case it redirects us to a png
        expected_mimetype: 'image/',         // only accept image/* to avoid saving html/txt error responses as icon
    } as FaviconCandidate
}

// Candidate: Google's favicon service for the hostname of the given URL.
const faviconFromGoogle = (url, size=256) => {
    // https://auth:pass@t.co:1234/a/bc123 -> https://www.google.com/s2/favicons?sz=256&domain=t.co
    let domain = null
    try {
        domain = url ? (new URL(url)).hostname : null
    } catch(err) {
        domain = null
    }
    return {
        // query params must be joined with '&', a ',' would make everything part of sz=
        url: domain?.includes('.') ? `https://www.google.com/s2/favicons?sz=${size}&domain=${domain}` : null,
        basename: 'google_favicon',
        extension: 'png',
        expected_mimetype: 'image/png',      // google always provides PNGs in response
    } as FaviconCandidate
}

// Candidate: href of the first <link rel*="icon"> element found in the page.
const faviconFromHtml = async (page) => {
    // <link rel="icon" href="/static/images/favicon.png"> -> https://example.com/static/images/favicon.png
    let url
    try {
        url = await page.$eval('link[rel*="icon"]', (elem) => elem?.href)
        if (!url || !url.includes('://'))
            url = null
    } catch(err) {
        // $eval throws when no element matches the selector
        url = null
        // console.warn('Failed to find favicon tag in html', JSON.stringify(err, null, 4))
    }

    return {
        url,
        basename: 'favicon',
        extension: undefined,                // auto-detect extension at download time
        expected_mimetype: 'image/',         // accept any image/* mimetype at download time
    } as FaviconCandidate
}

// Outcome of one download attempt; num_bytes stays 0 when nothing was saved.
type FaviconResult = {
    url: string,
    num_bytes: number,
    abspath?: string,
    dir?: string,
    filename?: string,
    mimeType?: string,
}

/**
 * Try each favicon candidate in order (html tag, /favicon.ico, google) until one downloads,
 * then write a summary of the attempt to FAVICON_PATH(page).
 *
 * @returns the favicon_info object that was written (succeeded=false if nothing downloaded)
 */
async function saveFavicon(page, {original_url, main_response, version}) {
    const dir = path.dirname(FAVICON_PATH(page))
    const response_url = main_response?.url()

    // candidates with a null url are dropped, the rest are de-duplicated by url
    const favicon_downloads_to_try: {[key: string]: FaviconCandidate} = unique([
        await faviconFromHtml(page),
        faviconFromDomain(response_url),
        faviconFromDomain(original_url),
        faviconFromGoogle(response_url),
        faviconFromGoogle(original_url),
    ].filter(({url}) => url), 'url')

    const browser = await page.browser()

    // let logs = []
    // let errors = []
    let output_files: {[key: string]: FaviconResult} = {}

    for (const download_options of Object.values(favicon_downloads_to_try)) {
        let result: FaviconResult = {num_bytes: 0, url: download_options.url}
        // {url, num_bytes, abspath, dir, filename, basename, extension, mimeType}
        try {
            // try getting it with node-fetch first
            const response = await fetch(download_options.url) as Response
            const file_options = await detectFilename({...download_options, response, dir})

            if (response.headers.get("content-length")) {
                const favicon_stream = Readable.fromWeb(response.body as any)
                await overwriteFile(file_options.abspath, favicon_stream)
                result = {
                    ...file_options,
                    num_bytes: parseInt(response.headers.get("content-length") || '0'),
                    mimeType: response.headers.get("content-type"),
                }
            } else {
                throw 'Failed to download favicon with fetch()'
            }
        } catch(err) {
            // console.warn('[!] Failed to get favicon with node-fetch', err)
            // fallback to getting it by opening a new browser tab
            try {
                result = await download({...download_options, browser, dir, page})
            } catch(download_err) {
                // one broken candidate should not prevent trying the remaining ones
                console.warn('[!] Failed to download favicon candidate', download_options.url, download_err)
                continue
            }
        }

        // logs.push(...(result.logs || []))
        // errors.push(...(result.errors || []))

        if (result?.num_bytes) {
            console.log(`[🌠] Saving page favicon (${result.url.substring(0, 35)}... ${result.mimeType})...`.padEnd(82), prettyPath(result.abspath))
            output_files[result.filename] = result
            break   // break here stops after the first successful download, comment out to keep going instead
        }
    }

    // pick the largest file; sort() needs a two-argument comparator to order by size
    const output_file = Object.values(output_files)
        .sort((a, b) => a.num_bytes - b.num_bytes)
        .at(-1)

    const favicon_info = {
        TYPE: 'favicon',
        VERSION: version,
        URL: original_url,
        succeeded: !!output_file,
        // stdout: JSON.stringify(logs),
        // stderr: JSON.stringify(errors),
        favicon_url: output_file?.url,
        favicon_urls: Object.keys(favicon_downloads_to_try),
        favicon_files: Object.keys(output_files).map(fname => fname.replace(dir, '.')),
        favicon_filename: output_file?.filename,
        favicon_num_bytes: output_file?.num_bytes,
    }
    await overwriteFile(FAVICON_PATH(page), favicon_info)

    return favicon_info
}
/**
 * Request the `.map` file for every .js/.css response listed in responses/index.jsonl.
 *
 * The fetches are issued from inside the page via page.evaluate(); this function
 * does not write the responses anywhere itself.
 *
 * @returns summary object listing the source map URLs that were requested
 */
async function saveSourceMaps(page, {original_url, version}) {
    console.log(`[🐛] Saving source maps to ./responses/all/*.{js,css}.map...`)

    const response_index_path = path.join(RESPONSES_PATH(page), 'index.jsonl')
    const response_index = await fs.promises.readFile(response_index_path, 'utf-8')

    const urls_to_download = []

    for (const response of response_index.split('\n')) {
        try {
            const {url, extension} = JSON.parse(response)
            if (url && ['css', 'js'].includes(extension?.toLowerCase())) {
                urls_to_download.push(url + '.map')
            }
        } catch(err) { continue }      // skip blank/unparseable lines
    }

    // TODO: fix this, it needs to both after stopSavingMetadata and before stopSavingMetadata
    // fix is to use traffic_log to get response url list instead of waiting for index.jsonl to be created
    await page.evaluate(async (urls_to_download) => {
        // for...of iterates the URLs themselves (for...in would yield the indexes "0", "1", ...)
        const promises = []
        for (const sourcemap_url of urls_to_download) {
            promises.push(fetch(sourcemap_url))
        }
        // wait for every request to settle, failed fetches are ignored
        await Promise.allSettled(promises)
    }, urls_to_download)

    return {
        TYPE: 'sourcemaps',
        URL: original_url,
        VERSION: version,
        sourcemaps: urls_to_download,
    }
}
/**
 * Collect the request + response headers logged for the main request and
 * write them to HEADERS_PATH(page) when at least one header was found.
 *
 * @param traffic_log  object keyed by request id, each value keyed by event name
 * @returns the headers_info object (returned even when nothing was written)
 */
async function saveHeaders(page, {original_url, version, traffic_log}) {
    // the first traffic_log key without a '.' in it is treated as the main request
    const main_id = Object.keys(traffic_log).find(request_id => !request_id.includes('.'))
    const main_events = traffic_log[main_id] || {}

    // copy the base event and layer the browser-added ExtraInfo headers on top of its own headers
    const withExtraHeaders = (base_event, extra_info) => {
        const combined = {...base_event}
        combined.headers = {...combined.headers, ...(extra_info?.headers || {})}
        return combined
    }

    const request = withExtraHeaders(
        main_events['Network.requestWillBeSent']?.request,
        main_events['Network.requestWillBeSentExtraInfo'],
    )
    const response = withExtraHeaders(
        main_events['Network.responseReceived']?.response,
        main_events['Network.responseReceivedExtraInfo'],
    )

    const headers_info = {
        TYPE: 'headers',
        VERSION: version,
        URL: original_url,
        request,
        response,
    }

    // count distinct header names across request and response
    const header_names = new Set([
        ...Object.keys(request.headers),
        ...Object.keys(response.headers),
    ])
    if (header_names.size === 0) return headers_info

    console.log(`[👾] Saving main request & response headers (${header_names.size})...`.padEnd(82), prettyPath(HEADERS_PATH(page)))
    await overwriteFile(HEADERS_PATH(page), headers_info)

    return headers_info
}
main_request_id = Object.keys(traffic_log).filter(id => !id.includes('.'))[0] + const main_response_traffic = traffic_log[main_request_id] || {} + + const relevant_response_keys = [ + 'url', + 'status', + 'mimeType', + 'connectionReused', + 'remoteIPAddress', + 'remotePort', + 'fromServiceWorker', + 'encodedDataLength', + 'protocol', + 'alternateProtocolUsage', + 'securityState', + 'securityDetails', + ] + let ssl_info = Object.entries(main_response_traffic['Network.responseReceived']?.response || {}) + .reduce((obj, [key, val]) => { + if (relevant_response_keys.includes(key)) { + obj[key] = val + } + return obj + }, {}) as any + + // TODO: parse SSL certificate sha256 hash from chrome://system/#chrome_root_store + // const ssl_certificate = await client.send('Network.getCertificate', {origin: original_url}) + // ssl_info.sslCertSha256 = '' + + ssl_info = { + TYPE: 'ssl', + VERSION: version, + URL: original_url, + ...ssl_info, + } + + if (Object.keys(ssl_info).length-3) { + console.log(`[🔏] Saving page SSL details (${ssl_info?.securityDetails?.protocol})...`.padEnd(82), prettyPath(SSL_PATH(page))) + await overwriteFile(SSL_PATH(page), ssl_info) + } + + return ssl_info +} + + +async function saveDOM(page, {original_url, version}) { + const html = await page.content(); + console.log(`[📖] Saving DOM dump (${html.length})...`.padEnd(82), prettyPath(DOM_PATH(page))) + const html_with_header = + `\n${html}` + await overwriteFile(DOM_PATH(page), html_with_header) + return DOM_PATH(page) +} + +async function saveBodyText(page, _page_state) { + const innerText = await page.evaluate(() => document?.body?.innerText); + + if (innerText?.length) { + console.log(`[📃] Saving body text (${innerText.length})...`.padEnd(82), prettyPath(BODYTEXT_PATH(page))) + await overwriteFile(BODYTEXT_PATH(page), innerText) + } + + // // alternative method: emulate Ctrl+A, Ctrl+C (sometimes gets more than body.innerText) + // const innerText = await page.$eval('*', (el) => { + // const selection 
/**
 * Convert the saved DOM/SingleFile HTML to markdown with pandoc, then convert that
 * markdown back to HTML next to it (same path with .md replaced by .html).
 *
 * pandoc writes its output straight to the --output path, so success is judged by
 * the output file being non-empty rather than by stdout.
 *
 * @returns {TYPE, VERSION, URL, cmd, markdown_file, html_file?} on success,
 *          or null when no input HTML exists or the HTML -> markdown step fails
 */
async function savePandoc(page, { original_url, version }) {
    const markdown_path = PANDOC_PATH(page)
    const html_path = markdown_path.replace('.md', '.html')
    console.log(`[📒] Converting DOM HTML to markdown with Pandoc...`.padEnd(82), prettyPath(markdown_path))

    // an empty array is truthy, so the length has to be checked explicitly
    const dom_paths = [DOM_PATH(page), SINGLEFILE_PATH(page)].filter(dom_path => fs.existsSync(dom_path))
    if (!dom_paths.length) return null
    const dom_path = dom_paths[0]

    const BIN_NAME = 'pandoc'

    // exec() runs the command through a shell, so single-quote every argument (paths may contain spaces)
    const shellQuote = (arg) => "'" + String(arg).replace(/'/g, "'\\''") + "'"

    // run pandoc and verify that it actually produced a non-empty output file
    const runPandoc = async (args, out_path) => {
        const { stderr } = await exec(args.map(shellQuote).join(' '))
        const { size } = await fs.promises.stat(out_path)
        if (!size) throw `Got empty result! ${stderr || ''}`
    }

    // pandoc --from html --to markdown_github --citeproc --wrap=none --highlight-style=kate
    const args = [
        BIN_NAME,
        '--from=html',
        '--to=markdown_github',
        '--wrap=none',
        '--citeproc',
        '--highlight-style=kate',
        `--output=${markdown_path}`,
        dom_path,
    ]
    let result: any = null
    try {
        await runPandoc(args, markdown_path)
        result = {
            TYPE: 'pandoc',
            VERSION: version,
            URL: original_url,
            cmd: args,
            markdown_file: markdown_path,
        }
    } catch (err) {
        console.warn('[❌] Failed to run Pandoc HTML to MD conversion', err)
        return null
    }

    // pandoc --from markdown_github --to html --citeproc --wrap=none --highlight-style=kate
    const reverse_conversion_args = [
        BIN_NAME,
        '--from=markdown_github',
        '--to=html',
        '--wrap=none',
        '--citeproc',
        '--highlight-style=kate',
        `--output=${html_path}`,
        markdown_path,
    ]
    try {
        await runPandoc(reverse_conversion_args, html_path)
        result = {
            ...result,
            html_file: html_path,
        }
    } catch (err) {
        // the markdown output is still valid, just report it without html_file
        console.warn('[❌] Failed to run Pandoc MD to HTML conversion', err)
    }

    return result
}
frame.childFrames()) { + dumpFrameTree(child, indent + '>'); + } + } + dumpFrameTree(page.mainFrame(), ''); + // console.log(iframes) + + // generate simple table-of-contents of all the key html elements (e.g. h1, h2, h3, article, main, etc.) + const outline = await page.evaluate(() => { + const headings = [] + for (const elem of [...document.querySelectorAll("h1, h2, h3, h4, h5, h6, a, header, footer, article, main, aside, nav, section, figure, summary, table, form, iframe")] as HTMLElement[]) { + + // skip a tags that aren't named anchors + if (elem.tagName.toLowerCase() == 'a' && !(elem as HTMLAnchorElement).name) continue + + // e.g. article #main-article + const elem_id = ((typeof elem.id === 'string' && elem.id) || (elem as HTMLAnchorElement).name || elem.ariaLabel || elem.role || '') + const elem_classes = elem.className.trim().split(' ').slice(0, 3).join(' .') || '' + const elem_action = (elem as any).action?.split('/')?.slice(-1)?.join('/') + const summary = elem.innerText.length > 128 + ? `${elem.innerText?.slice(0, 128)}...` + : elem.innerText + + let prefix = '' + let title = (elem_id ? 
/**
 * Gather the page's meta tags (name/property -> content) and write them to
 * SEO_PATH(page) when at least one was found.
 *
 * @returns {TYPE, VERSION, URL, ...meta vars}, returned even when nothing was written
 */
async function saveSEO(page, {original_url, version}) {
    const seo_vars = await page.evaluate(() => {
        // keep only meta tags that have both a name/property and a content value
        const pairs = []
        for (const meta of [...document.querySelectorAll('meta')]) {
            const key = meta.getAttribute('name') || meta.getAttribute('property') || ''
            const value = meta.getAttribute('content') || ''
            if (key && value) pairs.push({key, value})
        }

        // shortest content first, so for a repeated key the longest content is assigned last and wins
        pairs.sort((a, b) => a.value.length - b.value.length)

        const vars = {}
        for (const pair of pairs) {
            vars[pair.key] = pair.value
        }
        return vars
    })

    const seo_info = {
        TYPE: 'seo',
        VERSION: version,
        URL: original_url,
        ...seo_vars,
    }

    const num_vars = Object.keys(seo_vars).length
    if (!num_vars) return seo_info

    console.log(`[🔎] Saving page SEO metadata (${num_vars})...`.padEnd(82), prettyPath(SEO_PATH(page)))
    await overwriteFile(SEO_PATH(page), seo_info)

    return seo_info
}
/**
 * Merge the browser's cookies, localStorage and sessionStorage with what
 * loadAuthStorage() returns, then write the result to AUTH_JSON_PATH and
 * hand the merged cookies to saveCookiesTxt().
 *
 * @returns the merged auth_info object, or null when the URL scheme is ignored
 *          or SAVE_AUTH_STORAGE is off
 * @throws  a string if window.localStorage / window.sessionStorage cannot be read
 */
async function saveAuthStorage(page, {client, version, original_url}) {
    const url = original_url || await page.url()
    const scheme = url.split(':')[0]
    if (URL_SCHEMES_IGNORED.includes(scheme) || !SAVE_AUTH_STORAGE) return null

    // const cookies = JSON.stringify(await page.cookies());  // doesnt include httponly cookies
    const {cookies: browser_cookies} = await client.send('Network.getAllCookies')

    // read one of the window storages as plain JSON keyed by the page's origin
    // (may fail in some cases https://github.com/puppeteer/puppeteer/issues/921)
    const readStorage = async (storage_name) => {
        try {
            return await page.evaluate(
                (storage_name) => JSON.parse(JSON.stringify({[window.location.origin]: (window as any)[storage_name]})),
                storage_name,
            )
        } catch(err) {
            throw `Failed to get page window.${storage_name}! ${err}`
        }
    }
    const browser_local_storage = await readStorage('localStorage')
    const browser_session_storage = await readStorage('sessionStorage')

    // WARNING: small TOCTTOU gap between this read-before-write and the write below
    // can possibly overwrite changes made by other processes in this gap
    const stored_auth = await loadAuthStorage(page, {client}, {apply: false})

    const cookies = dedupeCookies([...stored_auth.cookies, ...browser_cookies])
    const merged_session_storage = merge(stored_auth.sessionStorage, browser_session_storage)
    const merged_local_storage = merge(stored_auth.localStorage, browser_local_storage)

    const auth_info = {
        TYPE: 'auth',
        VERSION: version,
        URL: original_url,
        cookies: cookies,
        sessionStorage: merged_session_storage,
        localStorage: merged_local_storage,
    }

    console.log(`[đŸĒ] Saving cookies/localStorage/sessionStorage (${auth_info.cookies.length})...`.padEnd(82), prettyPath(AUTH_JSON_PATH));
    await overwriteFile(AUTH_JSON_PATH, auth_info);

    // Write to cookies.txt file using tough-cookie + @root/file-cookie-store
    await saveCookiesTxt(cookies)

    return auth_info
}
cookie.domain, + path: cookie.path, + key: cookie.name, + value: cookie.value, + expires: (new Date(cookie.expires * 1000)).toISOString(), + hostOnly: cookie.domain.startsWith('.'), + secure: cookie.secure, + } + // console.log('COOKIE_FOR_TOUGH_TXT', cookie_for_tough) + const parsed_cookie = ToughCookie.Cookie.fromJSON(cookie_for_tough) + // console.log('COOKIE_FOR_TOUGH_TXT_TO_DUMP', parsed_cookie) + try { + // assemble a fake URL just to satisfy ToughCookieJar's requirement of having a URL at set time + let url = cookie.secure ? 'https://' : 'http://' + if (cookie.domain.startsWith('.')) { + url = url + cookie.domain.slice(1) + } else { + url = url + cookie.domain + } + if (cookie.sourcePort && ![80, 443].includes(cookie.sourcePort)) { + url = `${url}:${cookie.sourcePort}` + } + url = `${url}${cookie.path || ''}` + await cookie_jar.setCookieAsync(parsed_cookie, url, {ignoreError: true}) + } catch(err) { + console.error('[❌] Failed to dump browser cookie for cookies.txt...', cookie_for_tough, '->', parsed_cookie, err) + } + } + console.log(`[đŸĒ] Saving cookies TXT (${cookies.length})...`.padEnd(82), prettyPath(COOKIES_TXT_PATH)); + await cookies_store.saveAsync() +} + +async function saveMetrics(page, {original_url, version, start_time, start_ts, traffic_log, redirects}) { + const end_time = (new Date()).toISOString() + const end_ts = Date.now() + const metrics_info = { + TYPE: 'metrics', + VERSION: version, + URL: original_url, + ...(await page.metrics()), + start_time, + start_ts, + end_time, + end_ts, + duration: (end_ts - start_ts), + num_requests: traffic_log.length, + num_redirects: Object.keys(redirects).length -1, + } + + console.log(`[đŸŽī¸] Saving final summary + timing metrics...`.padEnd(82+1), prettyPath(METRICS_PATH(page))) + await overwriteFile(METRICS_PATH(page), metrics_info) + + return metrics_info +} + + +/******************************************************************************/ 
+/******************************************************************************/ + +/**************************** Utility Helpers *********************************/ + + +function hashCode(str) { + // get a simple integer hash for a given string (based on java String#hashCode) + // useful only for throwaway nonces / easy deterministic random identifiers, not a replacement for sha256 + let hash = 0; + for (let i=0; i string)='id') { + // uniqueify an array of objects by a value within them, key can be name of attr or getter function + // > iter = [{id: 1}, {id: 2}, {id: 1}] + // > Object.entries(iter) = [ + // [ '0', { id: 1 } ], + // [ '1', { id: 2 } ], + // [ '2', { id: 1 } ] ] + // > unique(iter, 'id') => {1: {id: 1}, 2: {id: 2}} + + // > iter = {a1: {id: 1}, b2: {id: 2}, a3: {id: 1}} + // > Object.entries(iter) = [ + // [ 'a1', { id: 1 } ], + // [ 'b2', { id: 2 } ], + // [ 'a3', { id: 1 } ] + // ] + // > unique(iter, 'id') => {1: {id: 1}, 2: {id: 2}} + + const key_type = (typeof key) + if (!['function', 'string'].includes(key_type)) + throw 'key must be either a string lookup key or a function (obj, idx) => return unique_id' + + const key_func = (key_type === 'string') + ? 
(entry_obj, idx) => entry_obj[(key as string)] + : (entry_obj, idx) => (key as Function)(entry_obj, idx) // otherwise key is a callback func + + const seen = {} + for (const [idx, entry_obj] of Object.entries(iter)) { + const unique_id = key_func(entry_obj, idx) + if (seen[unique_id] === undefined) { + seen[unique_id] = entry_obj + } + } + + return seen +} + +const wait = (ms: number) => new Promise(res => { + if (ms > 10_000) { + console.debug(`[â˛ī¸] Waiting ${Math.round(ms/1000)}s...`) + } + setTimeout(res, ms) +}) + +const TimeoutError = Symbol() +const withTimeout = (promise, ms) => { + // run a promise with a time limit, raises a TimeoutError if it fails + let timer + return Promise.race([ + promise, + new Promise((_r, reject) => + timer = setTimeout(reject, ms, TimeoutError) + ), + ]).finally(() => clearTimeout(timer)) +} + +const MAX_VALID_DATE = new Date('2150-01-01T00:00:00.000Z') +const MIN_VALID_DATE = new Date('2010-01-01T00:00:00.000Z') +const UNIX_EPOCH_DATE = new Date(0) + +const validateDate = (date, {min=MIN_VALID_DATE, max=MAX_VALID_DATE, singleton=UNIX_EPOCH_DATE}={}) => { + assert((date instanceof Date), `Got invalid type for Date: ${typeof date} ${date} (expected Date)`) + assert(String(date) !== 'Invalid Date', `Got invalid value for Date: ${typeof date} ${date}`) + if (Number(date) === Number(singleton)) return date // epoch singleton is always valid + assert(date < max, `Got Date that was higher than MAX_VALID_DATE=${max}`) + assert(date > min, `Got Date that was lower than MIN_VALID_DATE=${min}`) + return date +} + +const parseVersionDateStr = (yyyymmddtime) => { + // YYYYMMDDhhmmssxxx or YYYYMMDDhhmmss or YYYYMMDDhhmm or YYYYMMDD -> Date + const is_only_numbers = /^\d+$/.test(yyyymmddtime.replace('.', '')) + assert(is_only_numbers, `Non-numeric characters in YYYYMMDD date are not allowed: ${yyyymmddtime} (while trying YYYYMMDDhhmmssxxx format)`) + + const num_digits = String(yyyymmddtime).split('.')[0].length + assert([17, 14, 12, 
8].includes(num_digits), `Got invalid number of digits (${num_digits}) in YYYYMMDD date: ${yyyymmddtime} (while trying YYYYMMDDhhmmssxxx format)`) + + const [_all, yyyy, mm, dd, hr, min, sec, ms] = /^(\d{4})(\d{2})(\d{2})(\d{2})?(\d{2})?(\d{2})?(\d{3})?$/.exec(yyyymmddtime) + assert(yyyy && mm && dd, `Could not find YYYYMMDD`) + const time_error_msg = `Detected YYYYMMDD[hhmm[ss[xxxx]]] but time segment is invalid ${hr}:${min || '__'}:${ms || '___'}` + if (ms) assert(hr && min && sec, time_error_msg) + if (sec) assert(hr && min, time_error_msg) + if (min) assert(hr, time_error_msg) + if (hr) assert (min, time_error_msg) + + const iso_str = `${yyyy}-${mm}-${dd}T${hr || '00'}:${min || '00'}:${sec || '00'}.${ms || '00'}Z` + const parsed_date = new Date(iso_str) + + return validateDate(parsed_date) // 1970-01-01T00:00:00.000Z (ISO format) +} + +const parseTimestampDateStr = (timestamp) => { + // 1709724291000 or 1709724291000.000 or 1709724291 or 1709724291.000 -> Date + timestamp = String(timestamp) + const is_only_numbers = /^\d+$/.test(timestamp.replace('.', '')) + assert(is_only_numbers, `Got invalid characters in timstamp: ${timestamp} (while trying xxxxxxxxxxxxx format)`) + + const num_digits = String(timestamp).split('.')[0].length + assert([13, 10, 1].includes(num_digits), `Got invalid number of digits (${num_digits}) in timestamp: ${timestamp} (while trying xxxxxxxxxxxxx format)`) + + let parsed_date = null + + if (num_digits === 13) { + parsed_date = new Date(Number(timestamp)) // 1709724291000 (unix timestamp w/ milliseconds) + } else if (num_digits === 10) { + parsed_date = new Date(Number(timestamp) * 1000) // 1709724291 (unix timestamp w/ seconds) + } else if (num_digits === 1) { + assert(String(timestamp) === '0', `Got invalid single-digit timestamp: ${timestamp} (while trying xxxxxxxxxxxxx format or 0 for UNIX epoch)`) + parsed_date = UNIX_EPOCH_DATE + } + return validateDate(parsed_date) +} + +const parseISODateStr = (iso_str) => { + // 
1970-01-01T00:00:00.000Z -> Date + const num_digits = String(iso_str).length + assert([24, 19, 16, 10].includes(num_digits), `Got invalid number of digits (${num_digits}) in ISO date: ${iso_str} (while trying 1970-01-01T00:00:00.000Z format)`) + + const parsed_date = new Date(iso_str) + return validateDate(parsed_date) +} + +const parseDate = (date) => { + // date === undefined => use today/now + // date === null => use unix epoch 0 aka 1970-01-01T00:00:00.000Z + // date *= YYYYMMDDHHMMSS => use a version date string (e.g. 20010131235958) + // date *= 1234567... => use a timestmap (e.g. 1709724291000) + // date *= 1970-01-01T... => use iso datetime (e.g. 1970-01-01T00:00:00.000Z) + // returns -> Date + + if (date === undefined) { + return (new Date()) // today (2024-05-29T22:02:34.682Z) aka timestamp=1717020154682 + } + if (date === null || date == 0) { + return UNIX_EPOCH_DATE // unix epoch (1970-01-01T00:00:00.000Z) aka timestamp=0 + } + if (date instanceof Date) { + return validateDate(date) // JS date Date('1970-01-01T00:00:00.000Z') + } + + if ((typeof date) === 'number') { + date = String(date) // unix timestamp e.g. 
1717020154682 + } + assert((typeof date) === 'string', `Tried to parse date but got unsupported type ${(typeof date)}: ${date}`) + + const errors = [`Failed to parse Date from string: ${date}`] + try { + return parseVersionDateStr(date) + } catch(err) { errors.push(err) } + try { + return parseTimestampDateStr(date) + } catch(err) { errors.push(err) } + try { + return parseISODateStr(date) + } catch(err) { errors.push(err) } + + throw errors.join('\n') +} + +const versionStrFromDate = (date, {withDate=true, withTime=true, withSeconds=true, withMilliseconds=false}={}) => { + // takes Date, returns YYYYMMDDHHMMSSXXX or YYYYMMDDHHMMSS or YYYYMMDDHHMM or YYYYMMDD + const parsed_date = parseDate(date) + + const [date_iso, time_iso] = parsed_date.toISOString().split('T') // ['2001-01-31', '23:59:58.090Z'] + + const components_to_use = [] + if (withDate) { + components_to_use.push(date_iso.replaceAll('-', '')) // '20010131' + } + if (withTime) { + const [hr, min, sec, ms] = time_iso.replace('Z', '').replace('.', ':').split(':') // ['23', '59', '58', '090'] + components_to_use.push(hr) + components_to_use.push(min) + if (withSeconds) { + components_to_use.push(sec) + if (withMilliseconds) { + components_to_use.push(ms) + } + } + } + assert(components_to_use.length, 'At least one of {withDate, withTime} must be set.') + + const final_str = components_to_use.join('') // 20010131235958 + + assert(parseVersionDateStr(final_str)) // sanity check to make sure it parses correctly + + return final_str +} + +// test date functions: +// console.log(parseDate('20120131')) +// console.log(versionStrFromDate(parseDate('20120131'))) +// console.log(versionStrFromDate(parseDate('0'))) +// console.log(versionStrFromDate(parseDate(0))) +// console.log(versionStrFromDate(parseDate(null))) +// console.log(versionStrFromDate()) +// console.log(versionStrFromDate(parseDate('20120131235859090'))) +// console.log(versionStrFromDate(parseDate('1970-01-01T00:00:00.000Z'))) +// 
// console.log(versionStrFromDate(parseDate('2024-12-01T00:00')))
// console.log(versionStrFromDate(parseDate('2024-12-01'), {withTime: false}))

const prettyPath = (path) => {
    // return a pretty-printable path where the abspath of the data dir is replaced with /data for brevity/privacy
    // (String.replace with a string pattern only swaps the first occurrence of DATA_DIR)
    return path.replace(DATA_DIR, './data')
}

const pathIsHidden = (relpath) => {
    // check if a path or any of the directories above it are hidden (e.g. ./some/.dir/abc or ./.DS_Store)
    // returns true if any path segment starts with '.', false otherwise

    // make sure test path behaves like an abspath (avoids edge-cases messing up relpaths on '' or '.' or './')
    let test_path = relpath
    if (test_path.startsWith('./'))
        test_path = test_path.substring(2)
    if (!test_path.startsWith('/'))
        test_path = path.join('/', test_path)

    // iterate through parents, checking if any parent is hidden until we reach /
    while (test_path !== '/') {
        const basename = path.basename(test_path)
        if (basename.startsWith('.')) {
            // console.log('PATH IS HIDDEN', relpath)
            return true
        }
        // otherwise set test_path to parent dir and repeat
        test_path = path.dirname(test_path)
    }
    return false
}

const pathDepth = (child_path, relative_to='.') => {
    // get the number of directory hops deep a child path is relative to '.' (or a given parent)
    // e.g. pathDepth('a/b/c') => 3, pathDepth('a') => 1
    // note: a child_path equal to relative_to also yields 1, because ''.split('/') has one element

    if (child_path.startsWith('/') && !relative_to.startsWith('/')) {
        // if child_path is absolute, then relative_to must be absolute as well otherwise depth will be depth all the way to the / root
        relative_to = fs.realpathSync(relative_to)
    }
    if (relative_to.startsWith('/') && !child_path.startsWith('/')) {
        // same deal, either both paths have to be relative, or both have to be absolute
        child_path = fs.realpathSync(child_path)
    }
    const relative_path_to_root = path.relative(relative_to, child_path)
    const num_hops_down = relative_path_to_root.split('/').length
    return num_hops_down
}

// fs.Dirent plus the extra attributes that getDirEntries attaches to each entry
interface DirentWithExtras extends fs.Dirent {
    relpath: string,
    abspath: string,
    reldepth: number,
}

async function getDirEntries(dir_path, {pwd=null, recursive=true, includeHidden=false, includeFiles=true, includeDirs=true, includeLinks=false, filter=null, maxdepth=-1}={}) {
    // get the list of all sub-paths under a given path recursively
    // returns a sorted string[] of paths relative to pwd (pwd defaults to dir_path itself)
    //   includeHidden / includeFiles / includeDirs / includeLinks: toggle which kinds of entries are kept
    //   filter:   (optional) func ({abspath, relpath, basename, dirent}) => true/false, a falsy return drops the entry
    //   maxdepth: (optional) when >= 0, entries where (reldepth - 1) > maxdepth are dropped
    // throws (via assert) if the directory does not exist

    // console.log('GETTING DIRECTORY ENTRIES', {dir_path, pwd, recursive, includeHidden, includeFiles, includeDirs, maxdepth})

    pwd = pwd || dir_path
    let dir_abspath = dir_path

    if (!dir_abspath.startsWith(pwd)) {
        dir_abspath = path.join(pwd, dir_abspath)
    }

    assert(fs.existsSync(dir_abspath), `Tried to get directory listing for dir that doesn't exist! ${prettyPath(dir_abspath)}`)

    return (await fs.promises.readdir(dir_abspath, { recursive, withFileTypes: true }))
        .map((dirent: DirentWithExtras) => {
            // filter combined with map because relpath is re-used in both operations
            const relpath = path.join(path.relative(pwd, dirent.parentPath), dirent.name)
            // console.log('CALCULATED RELATIVE PATH', relpath)

            // relpath is relative to pwd (not to dir_abspath), so it has to be joined back onto pwd,
            // joining it onto dir_abspath would repeat the subdir segment whenever pwd is a parent of dir_path
            const abspath = path.join(pwd, relpath)
            const basename = path.basename(dirent.name)
            if (!includeLinks && dirent.isSymbolicLink()) return null
            if (!includeFiles && dirent.isFile()) return null
            if (!includeDirs && dirent.isDirectory()) return null
            if (!includeHidden && pathIsHidden(relpath)) return null

            dirent.relpath = relpath
            dirent.abspath = abspath
            dirent.reldepth = pathDepth(relpath)
            // console.log('RELATIVE DEPTH MEASURED', prettyPath(dir_abspath), prettyPath(relpath), dirent.reldepth)

            if (maxdepth >= 0) {
                if ((dirent.reldepth-1) > maxdepth) return null
            }

            if ((typeof filter) === 'function') {
                const should_keep = filter({abspath, relpath, basename, dirent})
                if (!should_keep) {
                    // console.log('FILTER EXCLUDED RESULT', {abspath, relpath, basename, dirent})
                    return null
                }
            }

            return relpath
        })
        .filter(Boolean)    // drop the nulls returned for excluded entries
        .sort() as string[]
}


async function getTotalSize(dir_or_file_path, {pwd=null, _cache=null, filter=null, subfiles=null}={}) {
    // get the total size in bytes of a file or directory (recursively adds up file sizes within directory)
    //   pwd:      (optional) dir that dir_or_file_path is interpreted relative to, defaults to its parent dir
    //   _cache:   (optional) object of {path: size} consulted before touching the disk
    //   filter:   (optional) passed through to getDirEntries when listing a directory
    //   subfiles: (optional) pre-computed file listing to use instead of calling getDirEntries

    // check _cache first
    if (_cache && (dir_or_file_path in _cache))
        return _cache[dir_or_file_path]

    // make sure dir_or_file_path is under pwd
    pwd = pwd || path.dirname(dir_or_file_path)
    let abspath = dir_or_file_path
    if (!dir_or_file_path.startsWith(pwd)) {
        abspath = path.join(pwd, dir_or_file_path)
    }

    // if it's a file, stat it and return the size
    // console.log('CALCUALTED ABSPATH', {abspath, dir_or_file_path, pwd})
    const dirent = await fs.promises.stat(abspath)
    if (dirent.isFile()) {
        // console.log('CALCULATING FILE SIZE subfile=', prettyPath(abspath))
        return dirent.size
    }

    // if it's not a file and not a directory, give up, dont try to size special files like FIFO/socket/etc.
    if (!dirent.isDirectory()) return 0

    // if it's a directory, size is the sum of all the sizes of files within
    // console.log('CALCULATING SUBDIR SIZE subdir=', prettyPath(abspath))
    let total_bytes = 0
    const files_within = subfiles || await getDirEntries(dir_or_file_path, {
        pwd,
        recursive: true,
        includeDirs: false,
        includeFiles: true,
        filter,
    })
    for (const subpath of files_within) {
        total_bytes += await getTotalSize(subpath, {pwd, _cache, filter})
    }
    return total_bytes
}


async function getDirSizes(dir_path, {pwd=null, subfiles=null, withRoot=true, filter=null, maxdepth=-1}={}) {
    // get the size of a directory and all the files within (recursively) as a number of bytes
    // dir_path: path absolute or relative path of the directory you want size info for
    // pwd: path (optional) absolute path of the directory you want to interpret dir_path relative to
    // subfiles: dirent[] (optional) instead of reading disk, you can manually provide a getDirEntries results list to use
    // withRoot: bool include a summary entry for the root dir_path dir in the list as '.'
    // filter: function (optional) provide a filter func for dir entries ({abspath, relpath, basename, dirent}) => true/false
    // maxdepth: number (optional) does not affect actual calculations, but hides entries below a certain depth in the returned output for brevity
    // returns an object mapping relpath -> size in bytes

    assert((await fs.promises.stat(dir_path)).isDirectory(), `Tried to calculate directory sizes but path is not a directory! ${dir_path}`)
    pwd = pwd || dir_path

    // {'.': 246, 'example.json': 123, 'example2.txt': 123}
    const sizes = {}

    // first collect the list of all sub-files recursively and calculate their sizes individually
    const files_within = subfiles || await getDirEntries(dir_path, {
        pwd,
        recursive: true,
        includeDirs: false,
        includeFiles: true,
        // dont pass maxdepth here, we need the entire file listing to accurately calculate parent dir sizes
        // it never makes sense to ignore subfiles beyond a certain depth for size calculation
        filter,    // filter is allowed though, useful to calculate size of some subset of files that match a pattern
    })
    for (const subpath of files_within) {
        sizes[subpath] = await getTotalSize(subpath, {pwd, _cache: sizes, filter})
    }

    // then calculate the top-level directory total as the sum of all the file sizes under it
    const total_size = Object.values(sizes).reduce((a: number, b: number) => a + b, 0)

    // then calculate the subtotals of all the sub-directories
    const subdirs_within = await getDirEntries(dir_path, {pwd, recursive: true, includeDirs: true, includeFiles: false, filter, maxdepth})
    for (const subpath of subdirs_within) {
        sizes[subpath] = await getTotalSize(subpath, {pwd, _cache: sizes, filter})    // uses _cache to avoid re-computing
    }

    // if maxdepth is passed, filter results to only include paths shallower than max depth
    if (maxdepth >= 0) {
        for (const subpath of Object.keys(sizes)) {
            if (pathDepth(subpath) > maxdepth) {
                delete sizes[subpath]
            }
        }
    }

    // set total_size last so it appears at the bottom of the object in logs for convenience
    if (withRoot) {
        sizes['.'] = total_size
    }

    return sizes
}


async function getLargestPath(path_a, path_b) {
    // compare two files/directories and return the largest one of the two (calculating size recursively)
    // returns the realpath of the larger one, path_b wins when both sizes are equal

    path_a = await fs.promises.realpath(path_a)
    path_b = await fs.promises.realpath(path_b)
    const size_a = await getTotalSize(path_a)
    const size_b = await getTotalSize(path_b)

    // console.log('COMPARING', prettyPath(path_a), size_a, ' ', prettyPath(path_b), size_b)

    if (size_a > size_b) return path_a
    return path_b
}

async function findCommonAncestor(target_abspath, symlink_abspath, {relative=true, search_limit=DATA_DIR}: {relative?: boolean | string, search_limit?: string}={}) {
    // given a target path and a symlink path, find the common ancestor path they both share
    // (searches recursively through absolute path parent directories until a common dir is found, up to search_limit)
    //   relative: true => start searching from the symlink's parent dir
    //             false => start searching from search_limit
    //             string => start searching from that path (or its parent dir if it is a file)
    // returns the ancestor plus the relative paths needed to build a relative symlink
    // throws if relative has an unsupported type, or (via assert) if no common ancestor is found

    search_limit = await fs.promises.realpath(search_limit)

    let relative_dir = search_limit
    if ((typeof relative) === 'boolean') {
        // if start dir is default, set it to symlinks directory path
        if (relative) {
            relative_dir = path.dirname(symlink_abspath)
        } else {
            relative_dir = search_limit
        }
    } else if ((typeof relative) === 'string') {
        // if start dir is a string, use it as given (it is resolved with realpath further down)
        relative_dir = relative as string
    } else {
        throw `Got invalid type for relative path during common ancestor search: ${relative}`
    }

    if ((await fs.promises.stat(relative_dir)).isFile()) {
        // if start dir is a file, set it to its parent dir path
        relative_dir = path.dirname(relative_dir)
    }
    assert(
        (await fs.promises.stat(relative_dir)).isDirectory(),
        `Tried to find common ancestor starting from invalid search directory:\n 🔗 ${prettyPath(symlink_abspath)}\n -> ${prettyPath(target_abspath)}\n Error: search dir does not exist or is not a directory: ❌ ${prettyPath(relative_dir)}`,
    )

    const symlink_filename = path.basename(symlink_abspath)
    const target_filename = path.basename(target_abspath)
    const symlink_parent_abspath = await fs.promises.realpath(path.dirname(symlink_abspath))
    const target_parent_abspath = await fs.promises.realpath(path.dirname(target_abspath))
    const search_dir_abspath = await fs.promises.realpath(relative_dir)

    let closest_common_ancestor = search_dir_abspath

    // NOTE(review): plain string-prefix check, sibling dirs that share a name prefix (e.g. /a/foo and /a/foobar) also match
    const isAncestorCommon = (ancestor) => (
        target_parent_abspath.startsWith(ancestor)
        && symlink_parent_abspath.startsWith(ancestor))

    // check if both src and target start with the same ancestor path
    while (closest_common_ancestor !== search_limit) {
        if (isAncestorCommon(closest_common_ancestor)) break
        else {
            // otherwise go up one directory and try again
            // console.log(' ...going up a directory', prettyPath(closest_common_ancestor)+'/..')
            closest_common_ancestor = path.dirname(closest_common_ancestor)
        }
    }

    assert(
        isAncestorCommon(closest_common_ancestor),
        `Tried to create relative symlink but could not find common ancestor:\n 🔗 ${prettyPath(symlink_abspath)}\n -> ${prettyPath(target_abspath)}\n Error: target path and symlink path are not both under:\n ❌ ${prettyPath(closest_common_ancestor)}`,
    )

    const symlink_to_ancestor_relpath = path.relative(symlink_parent_abspath, closest_common_ancestor)    // ../../..
    const target_from_ancestor_relpath = path.join(path.relative(closest_common_ancestor, target_parent_abspath), target_filename)    // 'archive/19999999.23423523'
    const symlink_to_target_relpath = path.join(symlink_to_ancestor_relpath, target_from_ancestor_relpath)    // '../../../archive/19999999.23423523'

    return {
        closest_common_ancestor,
        search_dir_abspath,

        target_abspath,
        target_filename,
        target_from_ancestor_relpath,

        symlink_abspath,
        symlink_filename,
        symlink_to_ancestor_relpath,
        symlink_to_target_relpath,
    }
}

// fs.Stats plus the extra attributes attached by blockUntilExists
interface StatsWithExtras extends fs.Stats {
    abspath: string
    relpath?: string
    reldepth?: number
}

async function blockUntilExists(file_path, {timeout=7_500, min_bytes=0}={}) {
    // wait up to timeout milliseconds until file we expect to exist appears on the filesystem
    // (used to handle eventual consistency in network filesystems where we need a delay after writing before reads show up)
    //   timeout:   max time to keep polling in ms (polls every 250ms)
    //   min_bytes: when set, a file smaller than this is only accepted if it has at least 1 byte, empty files keep being retried
    // returns the fs.Stats of the file with an extra .abspath attribute (the resolved realpath)
    // throws a string error if the file has not appeared by the time the timeout is used up
    const interval = 250
    const max_tries = timeout / interval
    let tries = 0

    while (tries < max_tries) {
        try {
            const abspath = await fs.promises.realpath(file_path)
            assert(fs.existsSync(abspath))

            const dirent = await fs.promises.stat(abspath) as StatsWithExtras
            dirent.abspath = abspath

            if (min_bytes && (dirent.size < min_bytes)) {
                assert(dirent.size >= 1)
                // this is a valid warning but unfortunately its too common to bother showing:
                // console.warn(`[âš ī¸] Expected file to be >=${Math.round(min_bytes/1000)}kb but was only ${dirent.size/1000}kb:`, prettyPath(file_path))
            }

            return dirent
        } catch(err) {
            // any failure above (missing file, failed assert) counts as "not there yet": sleep and retry
            const waited = (tries * interval)
            if (waited === 5_000) {
                console.warn(`[âš ī¸] Waited >${waited/1000}s for file to appear (is filesystem or bg task running slow?):`, prettyPath(file_path))
            }
            await wait(interval)
            tries++
        }
    }
    throw `Expected file does not exist after ${timeout/1000}s: ${prettyPath(file_path)}`
}

async function overwriteSymlink(target_path, symlink_path, {relative=true, mkdirs=false, search_limit=DATA_DIR, timeout=5_000}: {relative?: boolean | string, mkdirs?: boolean, search_limit?: string, timeout?: number}={}) {
    // create a symlink from symlink_path -> target_path
    // relative: true => symlink is created as a relative link by default (it will auto-find the closest common ancestor dir, often DATA_DIR)
    // mkdirs: true => optionally creates symlink parent dirs automatically)
    // timeout: ms to wait for the target (and afterwards the created symlink) to show up on the filesystem
    // returns an object describing the created symlink and its target
    // throws a string error if the target never appears, is not a file/dir, or the symlink's parent dir is missing

    // make sure target file actually exists first
    let target_dirent
    try {
        target_dirent = await blockUntilExists(target_path, {timeout})
    } catch(err) {
        throw `Tried to create symlink pointing to file that does not exist:\n 🔗 ${prettyPath(symlink_path)}\n -> ❌ ${prettyPath(target_path)}\n ${err}`
    }
    const target_abspath = target_dirent.abspath
    const target_filename = path.basename(target_abspath)
    const target_parent_abspath = path.dirname(target_abspath)

    // make sure target is a valid file or directory and not a special character/block device/other weird file
    const target_is_dir = target_dirent.isDirectory()
    const target_is_file = target_dirent.isFile()
    assert(target_is_dir || target_is_file, `Tried to create symlink to an unsupported file type:\n 🔗 ${prettyPath(symlink_path)}\n -> ❌ ${prettyPath(target_path)} (expected file or directory)`)

    // create symlink file parent directories if needed
    const symlink_filename = path.basename(symlink_path)
    const symlink_parent_dir = path.dirname(symlink_path)
    if (mkdirs) {
        await fs.promises.mkdir(symlink_parent_dir, {recursive: true})
    }
    try {
        assert((await fs.promises.stat(symlink_parent_dir)).isDirectory())
    } catch(err) {
        throw `Tried to create symlink in a directory that doesn't exist:\n 🔗 ${symlink_parent_dir}❌/${symlink_filename}\n -> ${target_path}\n ${err}`
    }
    const symlink_parent_abspath = await fs.promises.realpath(symlink_parent_dir)
    const symlink_abspath = path.join(symlink_parent_abspath, symlink_filename)

    // determine nearest common ancestor between symlink dir and target dir
    const {
        closest_common_ancestor,
        symlink_to_ancestor_relpath,
        target_from_ancestor_relpath,
        symlink_to_target_relpath,
    } = await findCommonAncestor(target_abspath, symlink_abspath, {relative, search_limit})

    // set final target path to abspath or relative path depending on {relative} options
    let target_path_final
    if (relative) {
        // make symlink into relative link (based on closest common ancestor dir between symlink_abspath and target_abspath)
        target_path_final = symlink_to_target_relpath
        // console.log(' 🔗', prettyPath(symlink_abspath), '->', prettyPath(target_abspath), `(as relative link: ${target_path_final})`)
    } else {
        // make symlink into an absolute path (verbatim passed target_path)
        target_path_final = target_path
        // console.log(' 🔗', prettyPath(symlink_abspath), '->', prettyPath(target_abspath), '(as absolute path)')
    }

    // remove any existing symlink at destination if there is already one there
    const random_nonce = crypto.randomBytes(16).toString('hex').substring(0, 8)
    const symlink_temp_path = `${symlink_abspath}.${random_nonce}.dup`
    try { await fs.promises.unlink(symlink_abspath) } catch(err) {}
    try { await fs.promises.unlink(symlink_temp_path) } catch(err) {}

    // create the symlink and check that it works after creation
    // (created under a temp name first, then renamed into place; created_symlink tracks whichever path was reached last)
    let created_symlink = null
    try {
        created_symlink = symlink_temp_path
        await fs.promises.symlink(target_path_final, symlink_temp_path)
        created_symlink = symlink_abspath
        await fs.promises.rename(symlink_temp_path, symlink_abspath)
    } catch(err) {
        if (String(err).includes('EISDIR')) {
            // console.warn('[âš ī¸] Tried to create symlink on top of existing directory', prettyPath(symlink_abspath))

            // no real recourse in this situation, and its too noisy to log every time this happens
            // it's also not always safe to move the dir out of the way, so better to just fail silently here, leaving:
            // ${symlink_abspath}.${random_nonce}.dup
        } else {
            console.warn('[âš ī¸] Failed to create symlink', prettyPath(created_symlink), err)
        }
    }

    let dirent
    try {
        dirent = await blockUntilExists(created_symlink, {timeout, min_bytes: 0})
        // best we can do here is just check that it exists ^, trying to check that it has the exact expected abspath that we set is bad, because its a race condition:
        // assert(dirent.abspath == target_abspath) // its often already overwritten by later activity, so final abspath may already be different
    } catch(err) {
        throw `Symlink created but does not seem to resolve to intended file:\n 🔗 ${symlink_path}\n -> ❌ ${target_path}\n actual=${dirent?.abspath}\n expected=${target_abspath}\n ${err}`
    }

    return {
        symlink_path,
        symlink_abspath: created_symlink,
        symlink_filename: path.basename(created_symlink),
        symlink_parent_abspath,
        symlink_to_ancestor_relpath,
        symlink_to_target_relpath,

        target_path,
        target_abspath,
        target_filename,
        target_parent_abspath,
        target_from_ancestor_relpath,
        target_path_final,
        target_is_dir,
        target_is_file,
        target_is_relative: Boolean(relative),

        closest_common_ancestor,
    }
}

// test symlink and common ancestor finding
// console.log(await findCommonAncestor('/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/archive/1709724410.19269/seo.json', '/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/archive/1709724410.19269/seo2.json'))
// console.log(await findCommonAncestor('/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/archive/1709724410.19269', '/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/index/snapshots_by_domain/twitter.com/1709724410.19269', {relative: true, search_limit: '/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/'}))
// console.log(await
overwriteSymlink('/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/archive/1709724410.19269', '/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/index/snapshots_by_domain/twitter.com/1709724410.19269')) +// console.log(await overwriteSymlink('/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/index/snapshots_by_domain/twitter.com/1709724410.19269', '/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/index/favorite_snapshots/1709724410.19269', {relative: false, mkdirs: true, search_limit: '/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/'})) + + + +async function overwriteDir(path) { + // delete any existing folder at the destination path (important otherwise we may create a folder inside an existing folder/symlink) + try { + await fs.promises.rm(path, { recursive: true, force: true }); + } catch(err) {} + + await fs.promises.mkdir(path, {recursive: true}) + + return path +} + +async function overwriteFile(path, contents, options={encoding: 'utf8', flag: 'w', flush: false, block: true}) { + // write any JS value to a fresh file (e.g. String, Buffer, WritableStream, etc. anything JSON-serializable) + + const block_until_created = options.block || true + delete options.block + + try { + // delete any existing symlink/file present at the destination path + // (important otherwise we may write into an existing symlink by accident) + await fs.promises.unlink(path) + } catch(err) {} + + try { + let nonce = 1 + while ((await fs.promises.stat(path)).isDirectory()) { + // if we try to write a file to a path that already has a directory in that location + // (common when trying to write response JSON e.g. 
http://www.instagram.com/api/graphql returns json and www.instagram.com/api/graphql/abc returns json) + path = path.replace(`.${nonce-1}`, '') + `.${nonce}` + nonce++; + if (nonce > 20) throw `Too many conflicting files while trying to write to ${prettyPath(path)}` + } + } catch(err) { + if (!String(err).includes('no such file or directory')) { + console.warn('[âš ī¸] Warning: Problem with conflicting directory at while trying to write file', err) + } + } + + // refuse writing undefined/null/function because its likely an error and not intended + const content_is_null = (contents === null) || (contents === undefined) + const content_is_func = (typeof contents === 'function') + if (content_is_null || content_is_func) { + throw `Cannot write ${typeof contents} ${contents} to file: ${path}` + } + + // Numbers, BigInts, and Booleans can be cast to strings, then wrt + const content_is_primitive = ['number', 'bigint', 'boolean'].includes(typeof contents) + if (content_is_primitive) { + contents = String(contents) + await fs.promises.writeFile(path, contents, options as any) + if (block_until_created) await blockUntilExists(path, {min_bytes: Buffer.byteLength(contents)}) + return path + } + + // Strings and Buffers can be written directly to file + const content_is_string = (typeof contents === 'string' || contents instanceof String) + const content_is_buffer = Buffer.isBuffer(contents) + if (content_is_string || content_is_buffer) { + await fs.promises.writeFile(path, contents, options as any) + if (block_until_created) await blockUntilExists(path, {min_bytes: Buffer.byteLength(contents)}) + return path + } + + // WritableStream objects can be piped into file + const content_is_stream = (contents?.pipe) + if (content_is_stream) { + const stream_byte_length = contents.writableLength + const dest_file = fs.createWriteStream(path); + await finished(contents.pipe(dest_file)) + if (block_until_created) await blockUntilExists(path, {min_bytes: stream_byte_length}) + return 
async function saveExecResult(bin, args=null, {original_url, version}, {cwd='.', timeout=60_000, ...spawn_options}={}) {
    // Spawn `bin args...` inside `cwd`, logging the run to cmd.sh / stdout.log / stderr.log,
    // and write a summary (timings, returncode, file listings) to index.json once it exits.
    //   bin:             executable name or path, e.g. 'yt-dlp'
    //   args:            argv list, e.g. ['--some-arg', '--some-other-arg']
    //   original_url:    URL being processed (must contain '://'), recorded in the logs
    //   version:         version string of the binary, recorded in the logs
    //   cwd:             working directory for the child + log files (created if missing)
    //   spawn_options:   extra options forwarded to child_process.spawn
    // Returns a snapshot of the exec info taken right after spawning, plus getResult(timeout)
    // which resolves with the final exec info after index.json has been written.
    // NOTE(review): the `timeout` option is accepted but not used, a fixed 5min TIMEOUT is applied
    //               instead -- left unchanged because callers may rely on it, confirm which is intended.
    assert(bin)
    assert(original_url && original_url.includes('://'))
    assert(version)

    const BIN_NAME = bin                    // 'yt-dlp'
    const ARGS = args || []                 // ['--some-arg', '--some-other-arg']
    const CWD = cwd || process.cwd()        // '.'
    const TIMEOUT = 300_000                 // 5min timeout
    const PATH = process.env.PATH

    await fs.promises.mkdir(CWD, {recursive: true})

    // quick-n-dirty dump of cmd to bash script, but this might be better: https://github.com/nodejs/node/issues/34840#issuecomment-677402567
    const cmd_log_str = `#!/usr/bin/env bash
TYPE="${BIN_NAME}"
URL="${original_url}"
VERSION="${version}"

TIMEOUT=${TIMEOUT}
CWD="${CWD}"
PATH="${PATH}:$PATH"

${BIN_NAME} ${ARGS.map(arg => JSON.stringify(arg)).join(' ')}
`
    const cmd_log = path.join(CWD, 'cmd.sh')
    await overwriteFile(cmd_log, cmd_log_str)

    const stdout_log = fs.createWriteStream(path.join(CWD, 'stdout.log'))
    const stderr_log = fs.createWriteStream(path.join(CWD, 'stderr.log'))

    const start_date = new Date()
    const start_ts = Number(start_date)
    const start_time = start_date.toISOString()

    const child = child_process.spawn(
        BIN_NAME,
        ARGS,
        {
            cwd: CWD,
            timeout: TIMEOUT,                   // 5min timeout
            stdio: [null, 'pipe', 'pipe'],      // ./stdout.log 2>./stderr.log
            // detached: true,                  // run in background, don't block on response
            ...(spawn_options || {}),
        },
    )
    child.stdout.setEncoding('utf8')
    child.stdout.pipe(stdout_log)
    child.stderr.setEncoding('utf8')
    child.stderr.pipe(stderr_log)

    const exec_info = {
        TYPE: BIN_NAME,
        URL: original_url,
        VERSION: version,
        bin_name: BIN_NAME,
        args: ARGS,
        timeout: TIMEOUT,
        hostname: os.hostname(),
        bin_paths: PATH,
        ppid: process.pid,
        pid: child.pid,
        start_ts,
        start_time,
        end_time: null,
        end_ts: null,
        duration: null,
        returncode: null,
        log_files: {},
        output_files: {},
    }

    // flipped only after the close handler has finished ALL its bookkeeping.
    // (polling exec_info.end_time let getResult() resolve before output_files/index.json were ready)
    let finished_bookkeeping = false

    // promise that resolves when the command is finished executing
    // TODO: refactor to use withTimeout
    const getResult = (timeout=TIMEOUT) =>
        new Promise((resolve, reject) => {
            const loop = setInterval(() => {
                if (finished_bookkeeping) {
                    clearInterval(loop)
                    clearTimeout(timer)
                    resolve(exec_info)
                }
            }, 100)

            const timer = setTimeout(() => {
                clearInterval(loop)
                if (!finished_bookkeeping) {
                    // report the timeout that was actually waited for, not the global default
                    reject(new Error(`Process ${BIN_NAME} did not finish within TIMEOUT=${timeout}`))
                }
            }, timeout);
        })

    const logFilesFilter = ({relpath}) =>
        ['cmd.sh', 'stdout.log', 'stderr.log'].includes(relpath)

    const outputFilesFilter = ({relpath}) =>
        !['cmd.sh', 'stdout.log', 'stderr.log', 'index.json'].includes(relpath)

    const getOutputFiles = async (filter=outputFilesFilter) => {
        return await getDirInfo(CWD, {filter, withHelpers: false, withRoot: false, maxdepth: 6})
    }

    child.on('close', async (returncode) => {
        try {
            const end_date = new Date()
            exec_info.returncode = returncode
            exec_info.pid = child.pid
            exec_info.end_ts = Number(end_date)
            exec_info.end_time = end_date.toISOString()
            exec_info.duration = exec_info.end_ts - exec_info.start_ts
            exec_info.log_files = await getOutputFiles(logFilesFilter)
            exec_info.output_files = await getOutputFiles(outputFilesFilter)

            const end_metadata = `
# END_TIME="${exec_info.end_time}"
# DURATION=${exec_info.duration}
# RETURNCODE=${exec_info.returncode }
`
            await fs.promises.appendFile(cmd_log, end_metadata)

            // write exec_info json (which includes file list) to CWD/index.json
            await overwriteFile(path.join(CWD, 'index.json'), exec_info)
        } finally {
            // always release getResult(), even if indexing the output dir failed
            finished_bookkeeping = true
        }
    })
    // child.unref() // dont wait for child process to close

    const start_metadata = `
#################### LAST RUN LOG ####################
# HOSTNAME="${exec_info.hostname}"
# PPID=${exec_info.ppid}
# PID=${exec_info.pid}
# START_TIME="${exec_info.start_time}"
`
    await fs.promises.appendFile(cmd_log, start_metadata)

    return {
        ...exec_info,
        getResult,
    }
}
// in-memory cache of file hashes, keyed by PATH:SIZE:LAST_MODIFIED_TIME
const HASH_CACHE = {}

async function sha256File(file_path: string, {pwd=null}: {pwd?: string}={}): Promise<string> {
    // Return the hex sha256 of a single file's contents.
    //   file_path:  absolute path, or path relative to pwd
    //   pwd:        (optional) directory to resolve file_path against, defaults to file_path's own dirname
    // Results are memoized in HASH_CACHE until the file's size or mtime changes.
    // Rejects if the file is missing or unreadable.
    pwd = pwd || path.dirname(file_path);
    if (!file_path.startsWith(pwd)) {
        file_path = path.join(pwd, file_path);
    }

    const dirent = fs.statSync(file_path);
    const abspath = fs.realpathSync(file_path);
    const cache_key = `${abspath}:${dirent.size}:${dirent.mtimeMs}`;     // PATH:SIZE:LAST_MODIFIED_TIME
    if (cache_key in HASH_CACHE) {
        // return immediately on a cache hit (previously the file was still re-read and re-hashed)
        return HASH_CACHE[cache_key];
    }

    const hash = crypto.createHash('sha256');
    for await (const chunk of fs.createReadStream(abspath)) {
        hash.update(chunk);
    }
    const final_hash = hash.digest('hex');
    HASH_CACHE[cache_key] = final_hash;
    return final_hash;
}

async function getDirSha256(dir_path, {pwd=null, withRoot=true, filter=null, maxdepth=-1, subfiles=null}={}) {
    // console.log('CALCULATING SHA256 OF FILES IN DIR', dir_path, {withRoot, filter, maxdepth})
    // dir_path:   path                absolute or relative path of the directory you want the merkle sha256 for
    // pwd:        path                (optional) absolute path of the directory you want to interpret dir_path relative to
    // withRoot:   bool                include a summary entry for the root dir_path dir in the list as '.'
    // filter:     function            (optional) provide a filter func for dir entries ({abspath, relpath, basename, dirent}) => true/false
    // maxdepth:   number              (optional) does not affect actual calculations, but hides entries below a certain depth in the returned output for brevity
    // subfiles:   dirent[]            (optional) instead of reading disk, you can manually provide a getDirEntries results list to use
    // returns:    {[relpath]: sha256} one entry per file and per subdirectory (+ '.' for the root when withRoot=true)

    pwd = pwd || dir_path
    if (!dir_path.startsWith(pwd)) {
        dir_path = path.join(pwd, dir_path)
    }

    const dirent = await fs.promises.stat(dir_path)
    assert(dirent.isDirectory(), `Tried to compute sha256 of path but missing or not a directory! ${dir_path}`)
    assert((maxdepth >= -1), `maxdepth must be -1, 0, or 1, 2, 3, etc... (got ${maxdepth})`)

    // assert(!(filter && withRoot), `Cannot generate root hash (consistently) when a custom filter is provided!`)

    // get the sha256 of every file in a directory recursively (excluding hidden files and symlinks)
    // EQUIVALENT TO: find . -type f -not -path '*/.*' -print0 | sort -z | xargs -0 sha256sum
    const all_subfiles = (subfiles as string[]) || await getDirEntries(dir_path, {
        pwd,
        recursive: true,
        includeFiles: true,
        includeDirs: false,

        // ~~maxdepth,~~ // dont pass maxdepth here, we need the entire file listing to accurately calculate parent dir hashes.
        //               // it never makes sense to ignore subfiles beyond a certain depth for hash calculation. Hashes are
        //               // only useful IDs if they are consistent+repeatable, hashing to an arbitrary depth will produce
        //               // many different hashes for the same directory, which is not something we need/want polluting the hash space.

        filter,          // we do however allow passing a manual filter funcs which does actually affect the hash
                         // this is useful to allow quick checks to see whether a certain subset of files has changed or not
    })

    // resolved once up-front instead of once per file inside the loop below
    const real_dir_path = await fs.promises.realpath(dir_path)

    const hashes: {[key: string]: string} = {}
    let hashable_summary_str = ''
    for (const subfile of all_subfiles) {
        // {'versions/20240413144307/screen recording.mp4': '1df4d9c3aca8b36f1f73e327d56038f80a35db407a298edb16c72576d7dd894e', ...}
        hashes[subfile] = await sha256File(subfile, {pwd})
        const relpath = path.relative(real_dir_path, await fs.promises.realpath(path.join(pwd, subfile)))
        // NOTE(review): sha256sum prints two spaces between hash and path; this copy of the
        //               source shows one (whitespace may have been collapsed) -- confirm before relying on parity
        hashable_summary_str += `${hashes[subfile]} ./${relpath}\n`
    }
    // console.log('CALCULATED HASHES FOR ALL SUBFILES IN DIR', dir_path, Object.keys(hashes).length)

    // get list of subdirectories and recursively hash every subdirectory
    // EQUIVALENT TO: find . -type d -not -path '*/.*' -maxdepth ${maxdepth} -print | sort
    const subdirs = await getDirEntries(dir_path, {pwd, recursive: true, includeHidden: false, includeDirs: true, includeFiles: false, filter, maxdepth})

    // for each subdirectory, get its hash recursively and store it in the hash list
    for (const subdir of subdirs) {
        // console.log('GETTING SUBDIR HASH', subdir)
        // a directory's hash is defined as the hash of all the *files* within (excluding dirs/symlinks/hidden)
        const subdir_hashes = await getDirSha256(
            subdir,
            {pwd, withRoot: true, filter, maxdepth: 0},
        )
        hashes[subdir] = subdir_hashes['.']
    }
    // console.log('CALCULATED HASHES FOR ALL SUBDIRS IN DIR', dir_path, subdirs.length)

    // filter results if maxdepth is provided
    if (maxdepth >= 0) {
        for (const subpath of Object.keys(hashes)) {
            if (pathDepth(subpath) > maxdepth) {
                delete hashes[subpath]
            }
        }
    }
    // console.log('LIMITED OUTPUT DUE TO MAXDEPTH', maxdepth, Object.keys(hashes).length)

    // calculate the hash of the root '.' folder by hashing all of hashes of its contents
    // EQUIVALENT TO: find . -type f -not -path '*/.*' -print0 | sort -z | xargs -0 sha256sum | sha256sum
    if (withRoot) {
        // pass the first command's output containing the file list + hashes into another sha256
        // to get the final hash of the whole directory combined
        // console.log('CALCULATING FINAL ROOT HASH for ', dir_path)
        // console.log(hashable_summary_str)
        hashes['.'] = crypto.createHash('sha256').update(hashable_summary_str).digest('hex') as string
        // console.log('--->', hashes['.'])
    }

    return hashes
}
async function getDirInfo(dir_path, {pwd=null, withRoot=true, withHelpers=true, filter=null, maxdepth=-1, subfiles=null}={}) {
    // Build a dumpable index of a directory's contents: one entry per path with its sha256, size, and mimeType.
    //   dir_path:     absolute or relative path of the directory to describe
    //   pwd:          (optional) directory that dir_path is interpreted relative to
    //   withRoot:     include a summary entry for dir_path itself under the key '.'
    //   withHelpers:  return the augmented entries (paths, dirent, summary, ...) instead of the JSON-friendly core fields
    //   filter:       (optional) predicate applied to dir entries and again to each finished entry
    //   maxdepth:     (optional) forwarded to getDirSha256/getDirSizes to limit how deep the listed entries go
    //   subfiles:     (optional) pre-computed getDirEntries listing to use instead of reading the disk
    //
    // Example result (withHelpers=true):
    // {
    //     'example.txt': { ... },
    //     'foobar/example.mp3': { ... },
    //     '.': {
    //         sha256: '9fc58b3e...', num_bytes: 11540961, created: '2024-05-31T13:03:42.956Z',
    //         mimeType: 'inode/directory', extension: undefined, num_subpaths: 15,
    //         filename: '.', basename: '1709039915.378868', dirname: '/opt/archivebox/data/archive',
    //         cwd: '/opt/archivebox/data/archive/1709039915.378868/', relpath: './', reldepth: 1,
    //         abspath: '/opt/archivebox/data/archive/1709039915.378868',
    //         is_file: false, is_dir: true, dirent: Stats {...}, modified: '2024-05-31T13:03:42.956Z',
    //         summary: './data/archive/1709039915.378868 (inode/directory 11541kb 9fc58b3e)',
    //         helptext: 'Verify these hashes by running: ...',
    //     },
    // }

    pwd = pwd || dir_path
    if (!dir_path.startsWith(pwd)) {
        dir_path = path.join(pwd, dir_path)
    }

    // hashes and sizes are both computed recursively and keyed by the same relative paths
    const hashes = await getDirSha256(dir_path, {pwd, withRoot, filter, maxdepth, subfiles})
    const sizes = await getDirSizes(dir_path, {pwd, withRoot, filter, maxdepth, subfiles})

    const entry_names = Object.keys(hashes)
    const num_total_subpaths = entry_names.filter(name => name !== '.').length
    const has_filter = (typeof filter) === 'function'

    const details = {}
    for (const filename of entry_names) {
        if (!withRoot && filename === '.') continue

        const sha256 = hashes[filename]
        const abspath = await fs.promises.realpath(path.join(dir_path, filename))
        const dirent = await fs.promises.stat(abspath)
        const child_prefix = filename + '/'
        const num_subpaths = entry_names.filter(subpath => subpath.startsWith(child_prefix)).length
        const is_file = dirent.isFile()
        const is_dir = dirent.isDirectory()

        // type-specific fields stay undefined for anything that is neither a file nor a directory
        let mimeType = undefined
        let extension = undefined
        let subpath_count = undefined
        if (is_file) {
            mimeType = mime.lookup(abspath) || null
            extension = path.extname(filename)
        } else if (is_dir) {
            mimeType = 'inode/directory'
            subpath_count = (filename === '.') ? num_total_subpaths : num_subpaths
        }

        // bare-bones info suitable for JSON dumps/exports
        const basic_info = {
            sha256,
            num_bytes: sizes[filename],
            created: (new Date(dirent.ctimeMs)).toISOString(),
            mimeType,
            extension,
            num_subpaths: subpath_count,
        }

        // extra helpers suitable for usage in other areas of the codebase
        const size_kb = Math.round(basic_info.num_bytes/1000)
        const info_with_helpers = {
            ...basic_info,
            filename,
            basename: path.basename(abspath),
            dirname: path.dirname(abspath),
            cwd: dir_path,
            relpath: is_dir ? (filename + '/') : filename,
            reldepth: pathDepth(filename),
            abspath,
            is_file,
            is_dir,
            dirent,
            modified: (new Date(dirent.mtimeMs)).toISOString(),
            summary: `${prettyPath(abspath)} (${basic_info.mimeType} ${size_kb}kb ${sha256.substring(0, 8)})`,
            helptext: (filename === '.')
                ? `Verify these hashes by running:\n cd ${prettyPath(abspath)} \n find . -type f -not -path '*/.*' -print0 | sort -z | xargs -0 sha256sum | sha256sum`
                : undefined,
        }

        // the filter always sees the fully augmented entry, even when withHelpers=false
        if (has_filter && !filter(info_with_helpers)) continue

        details[filename] = withHelpers ? info_with_helpers : basic_info
    }
    return details
}
// Hints accepted by detectFilename(); every field is optional, but at least one of
// {response, page} must be provided.
type DetectFilenameOptions = {
    url?: string,
    response?: HTTPResponse | Response,
    page?: Page,
    dir?: string,
    abspath?: string,
    filename?: string,
    basename?: string,
    extension?: string,
    mimeType?: string,
    resourceType?: string,
}

async function detectFilename({ url, response, page, dir, abspath, filename, basename, extension, mimeType, resourceType }: DetectFilenameOptions) {
    // this function takes a url (and/or response/page), and detects the abspath,dir,filename,basename,extension,mimeType
    // from the URL (+ any enforced path components passed in via args)
    // example: detectFilename({url: 'https://example.com/favicon.png', extension: 'ico'}) outputs 'favicon.ico'
    //
    // returns: {url, dir, abspath, filename, basename, extension, mimeType, resourceType, resp_mimetype}
    // throws (strings) when neither response nor page is given, when no URL/filename can be determined,
    // or when abspath is combined with dir/filename/basename/extension
    //
    // it has some quirks that are specific to archiving and may not behave as you expect
    // e.g. if visiting the url https://example.com/error.zip returns a 500 text/html error page
    //      this may still save it as a .zip with mimeType=application/x-zip and ignore the response mimeType, because the url ends in .zip
    //      however, if the url has no extension, e.g. https://example.com/error it will
    //      auto-detect the mimeType based on the response and append an extension, saving as error.html
    //
    // ⚠️ SECURITY WARNING: think carefully about the permissions, shell injection, and RCE implications of any changes made here ⚠️
    // this function writes untrusted web content to the filesystem using auto-detected mimetype to coerce the extension,
    // which can be dangerous (e.g. what if one of these downloads is a malicious ransomware .exe, do we really want to give it .exe?
    // if we do, how do we make sure it never gets executed? (without damaging the integrity of the copy)

    if (!(response || page)) throw 'Either a page or a response must be provided in order to detect mimeType & URL'

    // a response whose .headers is not a function is treated as a fetch-style Response
    // and wrapped so it exposes the same url()/headers() methods used below
    // NOTE(review): the wrapper's headers() returns the Response's headers object as-is; the
    //               ['content-type'] index used further down presumably does not work on a fetch Headers instance -- confirm
    if (response && (typeof response.headers !== 'function')) {
        const node_fetch_response: Response = response as Response
        response = {
            url: () => node_fetch_response.url,
            headers: () => node_fetch_response.headers,
        } as unknown as HTTPResponse
    }
    response = response as HTTPResponse

    // explicit url wins, then the response's url, then the page's current url
    url = url || response?.url() || (await page.url())
    if (!url) throw 'URL was not provided and could not be detected from {response, page}'

    // Document, Stylesheet, Image, Media, Font, Script, TextTrack, XHR, Fetch, Prefetch, EventSource, WebSocket, Manifest, SignedExchange, Ping, CSPViolationReport, Preflight, Other
    try {
        resourceType = resourceType || response?.request()?.resourceType()
    } catch(err) {
        // ignore, sometimes response is null/not available
    }
    // NOTE(review): the lookup below is an exact, case-sensitive match on these keys;
    //               verify they match the casing of the resourceType values callers actually pass
    const resourceTypeToMimeType = {
        'Stylesheet': 'text/css',
        'Script': 'application/x-javascript',
        'WebSocket': 'application/json',
        'Website': 'text/html',
    }

    mimeType = mimeType || resourceTypeToMimeType[resourceType]             // guess extension based on request resourceType
    extension = extension || (mimeType ? mime.extension(mimeType) : null)

    // handle special url cases (e.g. schemes in URL_SCHEMES_IGNORED)
    if (url.startsWith('about:blank')) {
        filename = 'about_blank'
        mimeType = 'text/html'
    }
    else if (url.startsWith('data:')) {
        // data: urls have no path component, so name them after a hash of the whole url
        filename = `data__${hashCode(url)}`
    }

    // console.log('detectFilename>', {url, dir, abspath, filename, basename, extension, mimeType, resourceType})

    if (abspath) {
        // an explicit abspath dictates every path component, so no other hints may be set by this point
        // (including ones derived above, e.g. an extension guessed from mimeType/resourceType)
        if (dir || filename || basename || extension)
            throw '{abspath} should not be passed with other options (e.g. dir, filename, basename, extension)'
        // `var` re-binds the destructured parameters in function scope
        var {dir, base: filename, ext: extension, name: basename} = path.parse(abspath)
        // path.parse('/home/user/dir/file.txt') returns:
        // { root: '/',
        //   dir: '/home/user/dir',
        //   base: 'file.txt',
        //   ext: '.txt',
        //   name: 'file' }
    } else {
        dir = dir || path.resolve(process.cwd())

        filename = filename                                               // https://example.com/a.1.zip?e.pdf=2#g.h=3  => a.1.zip
            || (new URL(url)).pathname.split('/').at(-1)                  // https://example.com/file124.rss            => file124.rss   prefers last component of path with no query/hash
            || 'index'                                                    // https://example.com/abc/def/               => index.html
            //|| (new URL(url)).hostname.replaceAll('.', '_')             // https://example.com                        => example_com   (currently disabled, so this becomes index.html)
    }
    if (!filename) throw 'filename/abspath were not passed and could not be detected from url'

    const path_extname = path.extname(filename)
    // mimeType reported by the response itself, used only as a fallback below
    const resp_mimetype = response && (
        (response as any).mimeType
        || response.headers()['content-type']?.split(';')[0]
        || resourceTypeToMimeType[resourceType]
        || 'application/octet-stream'
    )

    mimeType = mimeType                                                   // https://example.com/a.1.zip?e.pdf=2#g.h=3  => application/x-zip   prefers mimetype based on extension in path, falls back to response mimeType
        || (path_extname && mime.lookup(path_extname))                    // https://example.com/file124.rss            => application/rss+xml
        || resp_mimetype                                                  // https://example.com/get?type=png           => image/png

    extension = extension
        || (path_extname && path_extname.replace('.', ''))                // https://example.com/a.1.zip?e.pdf=2#g.h=3  => zip    prefers extension in path, falls back to response mimeType's suggested extension
        || (resp_mimetype && mime.extension(resp_mimetype))               // https://example.com                        => html
        || ''                                                             // https://example.com/websocket.1            =>
    // extensions coming from path.parse()/callers may carry a leading dot
    if (extension.startsWith('.'))
        extension = extension.slice(1)

    basename = basename                                                   // https://example.com/a.1.zip?e.pdf=2#g.h=3  => a.1    prefers filename in path (without extension)
        || (path.parse(filename).name)

    basename = basename.slice(0, 120)                                     // truncate at 120 characters (leaving 8 chars for .ext)
    basename = basename.replace(/[^a-zA-Z0-9%+?&=@;_ \.-]/g, '')          // strip characters not allowed in filenames

    // re-assemble the final filename from the sanitized basename + chosen extension
    filename = basename + '.' + extension

    // no extension at all => drop the dangling dot
    if (filename.endsWith('.'))
        filename = filename.slice(0, -1)

    abspath = abspath || path.join(dir, filename)

    // console.log('detectFilename<', {url, dir, abspath, filename, basename, extension, mimeType, resourceType})

    return {
        url,
        dir,
        abspath,
        filename,
        basename,
        extension,
        mimeType,
        resourceType,
        resp_mimetype,
    }
}
// Options for download(): everything detectFilename() accepts, plus browser/expectation/timeout controls.
interface DowloadOptions extends DetectFilenameOptions {
    browser?: Browser
    expected_mimetype?: string
    timeout?: number
}

async function download({ url, browser, page, response, dir, abspath, filename, basename, extension, expected_mimetype, timeout }: DowloadOptions) {
    // Save the body of a url/response to disk, at a path chosen by detectFilename().
    //   url / response / page:   what to download; if no response is given the url is fetched in a temporary new tab
    //   browser:                 browser used to open the temporary tab (defaults to page.browser())
    //   dir, abspath, filename, basename, extension:   path hints forwarded to detectFilename()
    //   expected_mimetype:       (optional) prefix the response content-type must start with, otherwise nothing is written
    //   timeout:                 navigation timeout in ms for the temporary tab (default 120s)
    // Returns {url, response, errors, dir, abspath, filename, basename, extension, mimeType, bytesBuffer, num_bytes};
    // download problems are collected in `errors` rather than thrown, navigation/filename-detection failures still throw.
    url = url || (response as HTTPResponse)?.url() || (await page?.url())
    if (!url) throw 'No {url}, {response}, or {page} was provided to download from'
    ALREADY_ARCHIVED.add(url.slice(0, 4096))     // prevent running whole archive task on tabs we create for just for downloading

    browser = browser || (page && (await page.browser()))
    timeout = timeout || 120_000
    expected_mimetype = expected_mimetype || ''
    let newPage = null
    let errors = []
    let num_bytes = 0
    let bytesBuffer = null
    let mimeType = undefined

    try {
        // if we need to fetch the url (i.e. it's not already been requested)
        if (!response) {
            if (!browser) throw 'No {browser} or {page} was provided to download with'
            newPage = await browser.newPage()
            if (page) await page.bringToFront()  // if origin page is provided, make sure it stays in foreground
            response = await newPage.goto(url, {timeout: timeout, waitUntil: 'networkidle0'})
            if (page) await page.bringToFront()  // if origin page is provided, make sure it stays in foreground
        }
        url = url || (response as HTTPResponse)?.url() || (await newPage?.url()) || (await page?.url());
        const response_mimetype = (response as HTTPResponse).headers()['content-type']?.split(';')[0] || 'text/html'

        // detect the filename we should write to based on provided url/response/page/filename/extension suggestions
        ;({
            dir,
            abspath,
            filename,
            basename,
            extension,
            mimeType,
        } = await detectFilename({url, page, response, dir, abspath, filename, basename, extension, mimeType}))

        // if expected_mimetype is passed, make sure response matches it, otherwise consider download a failure
        if (!response_mimetype.startsWith(expected_mimetype)) {
            errors.push(`Expected ${expected_mimetype} but got ${response_mimetype}`)
        } else {

            // download the file using puppeteer's response.buffer()
            try {
                // write the response bytes into the output file
                bytesBuffer = await (response as HTTPResponse).buffer()
                await overwriteFile(abspath, bytesBuffer)
                num_bytes = bytesBuffer.length
            } catch(err) {
                errors.push(err)
            }

            // security check to make sure downloaded file is not executable (random binaries downloaded off the internet = dangerous)
            fs.access(abspath, fs.constants.X_OK, (err) => {
                if (!err) console.warn(
                    '[âš ī¸] SECURITY WARNING: Downloaded file appears to be executable:', prettyPath(abspath),
                    '\n (be careful running untrusted programs downloaded from the internet!)'
                )
            })
        }
    } finally {
        // if we opened a dedicated page for downloading, always close it,
        // even when navigation or filename detection threw (previously the tab leaked)
        if (newPage) {
            try { await newPage.close() } catch(err) { /* tab may already be gone, which is fine */ }
        }
    }

    if (errors.length) {
        // console.warn(`[❌] Downloading ${url} (${mimeType}) to ${abspath} failed:`, JSON.stringify(errors, null, 4))
    } else {
        console.log(`[💾] Downloaded ${url.substring(0, 40)} (${num_bytes} ${mimeType})...`.padEnd(82), prettyPath(abspath))
    }

    return {
        url, response, errors,
        dir, abspath, filename, basename, extension, mimeType,
        bytesBuffer, num_bytes,
    }
}
/************************** Puppeteer Launching *******************************/


async function startCluster(puppeteer, args=CHROME_ARGS_DEFAULT) {
    // Launch a puppeteer-cluster of CHROME_CLUSTER_WORKERS pages and return the Cluster instance.
    //   puppeteer:  the puppeteer (or puppeteer-extra) instance the cluster should drive
    //   args:       chrome launch CLI args, used verbatim (puppeteer's default args are disabled)
    const banner = `[🎭] Launching ${CHROME_CLUSTER_WORKERS}x Chromium browsers with puppeteer-cluster:`
    console.log(banner.padEnd(82), prettyPath(CHROME_PROFILE_PATH))

    const puppeteerOptions = {
        args,                                   // all the chrome launch CLI args
        ignoreDefaultArgs: true,                // trust me, we have enough args already...
        // dumpio: true,                        // full debug log output, super noisy
    }
    const cluster_options = {
        puppeteer,
        monitor: true,
        maxConcurrency: CHROME_CLUSTER_WORKERS,
        sameDomainDelay: 2550,
        workerCreationDelay: 250,
        timeout: 300_000,                       // total ms timeout for an entire task (1000ms * 60s * 5m)
        concurrency: Cluster.CONCURRENCY_PAGE,  // share cookies between all tabs in a given browser
        puppeteerOptions,
    }

    const cluster = await Cluster.launch(cluster_options)
    console.log('*************************************************************************')
    return cluster
}
async function remoteBrowser(puppeteer, {browserURL, browserWSEndpoint}) {
    // Attach puppeteer to an already-running Chromium via its debugging URL or websocket endpoint.
    // The targetFilter returns false until connect() has returned, and true from then on.
    console.log('[🎭] Connecting Puppeteer to existing Chromium browser via:', browserURL || browserWSEndpoint)

    let connected = false
    const connect_options = {
        browserURL,
        browserWSEndpoint,
        defaultViewport: null,
        targetFilter: () => connected,
    }
    const browser = await puppeteer.connect(connect_options)
    connected = true

    console.log('*************************************************************************')
    return browser
}

async function startBrowser(puppeteer, args=CHROME_ARGS_DEFAULT) {
    // Launch a single Chromium with the given CLI args, wire up debugging globals,
    // prime the extensions cache, and leave a placeholder tab open. Returns the browser.
    console.log('[🎭] Launching Puppeteer Chromium browser...'.padEnd(82+1), prettyPath(CHROME_PROFILE_PATH))

    const browser = await puppeteer.launch({ignoreDefaultArgs: true, args, dumpio: true})
    globalThis.browser = browser
    console.log('*************************************************************************')

    // store all active tabs on global var by url for easier vscode interactive debugging
    const trackTab = async (target) => {
        try {
            globalThis.tabs = globalThis.tabs || {}
            const tab_url = target.url()
            const tab = await target.page()
            const tab_is_gone = !tab || tab?.isClosed()
            if (tab_is_gone) {
                delete globalThis.tabs[tab_url]
            } else {
                globalThis.tab = tab
                globalThis.tabs[tab_url] = tab
            }
        } catch(err) {console.warn(err)}
    }
    for (const event_name of ['targetcreated', 'targetchanged', 'targetdestroyed']) {
        browser.on(event_name, trackTab)
    }

    // wait for initial extension background.js/service worker targets to load
    await wait(3_000)

    // prime the extensions cache
    const extensions = await getChromeExtensionsFromCache({browser})
    globalThis.extensions = extensions // for easier debugging only

    // give the user 2min to check any issues with the initial startup pages (bot profile pages),
    // solve captchas, re-login, etc. then close them after that to save resources
    const startup_pages = (await browser.pages())
    const startup_page_close_delay = 120_000
    const closeStartupPages = async () => {
        for (const startup_page of startup_pages) {
            try { await startup_page.close() } catch(err) { /* page may already be closed by now, which is fine */ }
        }
    }
    setTimeout(closeStartupPages, startup_page_close_delay)

    // setup any extensions that need final runtime configuration using their options pages
    // await setup2CaptchaExtension({browser, extensions})

    // open a placeholder page so browser window stays open when there are no active archiving pages
    // (it's annoying to have the entire window open/close/open/close/etc every time an archive task runs)
    const placeholder_page = await browser.newPage()
    await wait(250)
    await placeholder_page.goto('chrome://version')
    await wait(500)
    console.log('*************************************************************************')

    return browser
}
async function startAPIServer(port=API_SERVER_PORT, host=API_SERVER_HOST, taskCallback=null) {
    // Start a tiny HTTP server that accepts archive requests and return the server instance.
    //   GET  /?url=https://...   runs taskCallback({url}) and replies with the url + its task path
    //   POST (JSON body)         parses + logs the JSON and acknowledges it (taskCallback is not invoked)
    //   anything else            405 Method not allowed
    // taskCallback should be an async function that takes ({url}) => and does something with it
    assert(taskCallback && (typeof taskCallback === 'function'))

    const replyJSON = (res, status, payload) => {
        res.writeHead(status, { 'Content-Type': 'application/json' });
        res.end(JSON.stringify(payload));
    }

    const handlePost = (req, res) => {
        console.log(`[API][POST] ${req.url}`)
        let body = '';

        req.on('data', (chunk) => {
            body += chunk;
        });

        req.on('end', () => {
            try {
                const jsonData = JSON.parse(body);
                // Process the JSON data
                console.log(jsonData);

                replyJSON(res, 200, { message: 'JSON data received' })
            } catch (error) {
                replyJSON(res, 400, { error: 'Invalid JSON data' })
            }
        });
    }

    const handleGet = async (req, res) => {
        console.log(`[API][GET] ${req.url}`)
        const parsedUrl = new URL(`http://${host}:${port}${req.url}`)
        const query = new URLSearchParams(parsedUrl.search);
        const url = query.get('url');

        const url_looks_valid = url && url.includes('://')
        if (!url_looks_valid) {
            res.writeHead(500, { 'Content-Type': 'text/plain' });
            res.end(`Bad URL: ${url}\n\nExpected: /?url=https://example.com/url/to/archive`);
            return
        }

        // headers go out before the task runs, so a failed task is reported in the body only
        res.writeHead(200, { 'Content-Type': 'text/plain' });
        try {
            await taskCallback({url})
            res.end(`${url}\n${TASK_PATH(url)}`);
        } catch(err) {
            res.end(`${url}\n${TASK_PATH(url)}\n${err}`);
        }
    }

    const server = createServer(async (req, res) => {
        switch (req.method) {
            case 'POST':
                handlePost(req, res)
                break
            case 'GET':
                await handleGet(req, res)
                break
            default:
                replyJSON(res, 405, { error: 'Method not allowed' })
        }
    })

    server.listen(port, host, () => {
        console.log(`[🎰] API Server listening for requests on http://${host}:${port}/?url=...`);
    })
    console.log('*************************************************************************')

    return server
}
URL(`http://${host}:${port}${req.url}`) + const query = new URLSearchParams(parsedUrl.search); + const url = query.get('url'); + if (url && url.includes('://')) { + res.writeHead(200, { 'Content-Type': 'text/plain' }); + try { + await taskCallback({url}) + res.end(`${url}\n${TASK_PATH(url)}`); + } catch(err) { + res.end(`${url}\n${TASK_PATH(url)}\n${err}`); + } + } else { + res.writeHead(500, { 'Content-Type': 'text/plain' }); + res.end(`Bad URL: ${url}\n\nExpected: /?url=https://example.com/url/to/archive`); + } + } else { + res.writeHead(405, { 'Content-Type': 'application/json' }); + res.end(JSON.stringify({ error: 'Method not allowed' })); + } + }) + + server.listen(port, host, () => { + console.log(`[🎰] API Server listening for requests on http://${host}:${port}/?url=...`); + }) + console.log('*************************************************************************') + + return server +} + +async function main(urls, cluster=CHROME_CLUSTER) { + process.chdir(DATA_DIR) + + const extensions = await getChromeExtensionsFromPersona({CHROME_EXTENSIONS, CHROME_EXTENSIONS_DIR}) + const args = getChromeArgs({...CHROME_LAUNCH_OPTIONS, CHROME_EXTENSIONS: extensions}) + const preferences = getChromePreferences({CHROME_PREFERENCES_DEFAULT, CHROME_PREFERENCES_EXTRA, CHROME_DOWNLOADS_DIR, CHROME_EXTENSIONS: extensions}) + const Puppeteer = applyChromePreferences(PupeteerExtra, CHROME_PREFERENCES_PATH, preferences) + + Puppeteer.use(StealthPlugin()); + // Puppeteer.use(ReplPlugin()); + // handled by uBlock Origin & ReCaptcha browser extensions, probably not needed here anymore: + // Puppeteer.use(RecaptchaPlugin({ + // provider: {id: '2captcha', token: API_KEY_2CAPTCHA}, + // visualFeedback: true, + // })) + // const AdblockerPlugin = require('puppeteer-extra-plugin-adblocker') + // puppeteer.use(AdblockerPlugin({ blockTrackers: true })) + + if (cluster) { + // launch browser with multiple tabs w/ puppeteer + const cluster = await startCluster(Puppeteer, args) + + const 
handleTask = async ({url}) => cluster.queue(url, botArchiveTask) + const server = await startAPIServer(API_SERVER_PORT, API_SERVER_HOST, handleTask) + + console.log('[📋] Running tasks in parallel with puppeteer cluster...') + for (const url of urls) { + if (fs.existsSync(path.join(TASK_PATH(url), 'aiqa.json'))) { + try { + JSON.parse((await fs.promises.readFile(path.join(TASK_PATH(url), 'aiqa.json'))).toString()) + console.log(' skipping (already present):', TASK_PATH(url), url) + continue + } catch(err) { + // pass + } + } + cluster.queue(url, botArchiveTask) + await wait(3_000) + } + + await cluster.idle(); + await cluster.close(); + } else { + // launch single new browser w/ puppeter / connect to remote CDP browser w/ puppeteer + const browser = await startBrowser(Puppeteer, args) + // const browser = await remoteBrowser(Puppeteer, {browserURL, browserWSEndpoint}) + + // run speedtest in the background + speedtest({browser}) + + const handleTask = async ({url}) => await botArchiveTask({page: (await browser.newPage()), data: url}) + const server = await startAPIServer(API_SERVER_PORT, API_SERVER_HOST, handleTask) + + // wait for any pre-run setup tasks or server requests + await wait(5_000) + + let num_succeeded = 0 + let num_failed = 0 + + console.log(`[📋] Running ${urls.length} tasks sequentially with puppeteer browser...`) + for (const url of urls) { + const run_count = (num_succeeded + num_failed) || 1 + + // check if task should be run or skipped based on existing snapshot data present in directory + const metrics_path = path.join(TASK_PATH(url), 'metrics.json') + const screenshot_path = path.join(TASK_PATH(url), 'screenrecording.gif') + const aiqa_path = path.join(TASK_PATH(url), 'aiqa.json') + const versions_path = path.join(TASK_PATH(url), 'versions') + if (fs.existsSync(metrics_path) && fs.existsSync(screenshot_path) && fs.existsSync(aiqa_path) && fs.existsSync(versions_path)) { + try { + const ai_qa_result = JSON.parse(await 
fs.promises.readFile(aiqa_path, 'utf-8')) + console.log(prettyPath(TASK_PATH(url)), `${ai_qa_result.pct_visible}%`, ai_qa_result.website_brand_name, url.substring(0, 80)) + assert(ai_qa_result.website_brand_name) + continue + } catch(err) { + // pass + } + } + let delay = 0 + + // create a new browser page and run the archiving task + const page = (await browser.newPage()) + try { + console.log(ANSI.black + `◤==============================================================================[${String(run_count).padStart(3)}]/[${urls.length}]â—Ĩ` + ANSI.reset) + await botArchiveTask({page, data: url}) + delay = 1_000 + num_succeeded += 1 + } catch(err) { + console.error('[❌] Archiving task failed!', url) + console.error(err) + num_failed += 1 + delay = 15_000 // extra delay if there are errors + } + console.log(ANSI.black + `â—Ŗ==============================================================================[☑ ${num_succeeded}][🆇 ${num_failed}]â—ĸ` + ANSI.reset) + + // check for abnormally high failure rates and exit early if needed + const failure_pct = Math.round((num_failed/run_count) * 100) + if (failure_pct > 50) { + if (run_count > 5) { + console.warn(`[âš ī¸] ${failure_pct}% Task failure rate is very high! Will self-cancel after 10 URLs if >50% continue to fail...`) + } + if (run_count > 10) { + throw `Too many tasks failed in a row! Quitting early after ${run_count}/${urls.length} tasks.` + } + } + + // increase the delay between tasks based on the ratio of how many are failing:succeeding + delay = Math.pow(4, (num_failed/(num_succeeded + 3))) * delay + // e.g. 0:1 failure ratio == 1 * delay == 1 ~ 15s + // 1:1 failure ratio == 5 * delay == 5 ~ 1m ... 5^(failed:succeeded) exponential increase + // 2:1 failure ratio == 25 * delay == 25s ~ 6m + // 3:1 failure ratio == 125 * delay == 2m ~ 31m + // etc... 
+ // up to 1hr+ + delay = Math.min(delay, 3_600_000) // 1hr maximum delay between tasks + delay = Math.max(delay, 1_000) // 1s minimum delay between tasks + if (delay > 2_500) { + console.log('... waiting', Math.round(delay/1000), 'seconds (self rate-limit)...') + } + await wait(delay) // base ratelimit + console.log() + } + + + if (PASSIVE_ARCHIVING) { + // replace these as-needed: + const browserURL = 'http://localhost:9222/' + const browserWSEndpoint = 'ws://localhost:9222/devtools/browser' + + const driver_browser = browser || await remoteBrowser(Puppeteer, {browserURL, browserWSEndpoint}) + const archiver_browser = {} //await startBrowser(Puppeteer, args) + + const extensions = await getChromeExtensionsFromCache({browser: driver_browser}) + + // close both browsers if either one is closed + let browser_is_open = true + driver_browser.on('disconnected', async () => {browser_is_open = false}) // await archiver_browser.close() + // archiver_browser.on('disconnected', async () => {browser_is_open = false; await driver_browser.close()}) + + // handle any tab navigation to a new URL in the driver browser + const handleUserNavigation = async (target) => { + const url = target.url() + const page = await target.page() + // const client = await target.createCDPSession() + + if (target.type() == 'page' && page && url) { + console.log(ANSI.black + '==============================================================================' + ANSI.reset) + console.warn('[➕] DRIVER BROWSER NAVIGATED:', ANSI.blue, url, ANSI.reset) + + try { + await passiveArchiveTask({browser: driver_browser, page, url}) + await wait(3_000) + } catch(err) { + console.error('[❌] Archiving task failed!', url) + console.error(err) + await wait(10_000) // base ratelimit + } + console.log(ANSI.black + '==============================================================================' + ANSI.reset) + // await client.send('Page.enable') + // await client.send('Page.setWebLifecycleState', {state: 'active'}) + } + 
// await client.send('Runtime.runIfWaitingForDebugger') + } + + // setup handler to archive new page whenever one is opened + driver_browser.on('targetcreated', handleUserNavigation) + driver_browser.on('targetchanged', handleUserNavigation) + + console.log('------------------------------------------------------') + console.log('[👀] Waiting for browser tabs to be opened by human...') + while (browser_is_open) { + await wait(2_000) + } + } else { + while (true) { + await wait(2_000) + } + } + + await browser.close() + } + console.log('[✅] Finished all tasks and stopped browsers.') + process.exit(0); +} + + +/******************************************************************************/ +if (import.meta.main) { + main(URLS).catch(console.error); +} + +/******************************************************************************/ + +// if we want to handle CLI args in the future, minimist is great: +// var argv = require('minimist')(process.argv.slice(2)); +// console.log(argv); // --url=https://example.com --binpath=/browsers/chromium-1047/bin/chromium --datadir=/Chromium +// const {url, binpath, datadir} = argv; + + +// OLD CODE, may be useful in the future if we need audio in screenrecordings: +// async function setupScreenrecordingWithAudio(page, wss) { +// console.log('[đŸŽŦ] Setting up screen-recording plugin...'); +// const stream_port = (await wss).options.port; +// // streamPage = await (page.browser()).newPage() +// await page.goto(`chrome-extension://jjndjgheafjngoipoacpjgeicjeomjli/options.html#${stream_port}`) +// +// // puppeteer-stream recording start +// streamFile = fs.createWriteStream(SCREENRECORDING_PATH(page)) +// stream = await getStream(page, { +// audio: true, +// video: true, +// bitsPerSecond: 8000000, // 1080p video +// }); +// stream.pipe(streamFile); +// return {stream, streamFile} +// +// // puppeteer-stream recording stop & cleanup +// if (stream && streamFile) { +// await stream?.destroy(); +// streamFile?.close(); +// // await 
streamPage.close(); +// } +// } diff --git a/package-lock.json b/package-lock.json deleted file mode 100644 index e68b9dc1c6..0000000000 --- a/package-lock.json +++ /dev/null @@ -1,2198 +0,0 @@ -{ - "name": "archivebox", - "version": "0.6.0", - "lockfileVersion": 1, - "requires": true, - "dependencies": { - "@babel/runtime-corejs2": { - "version": "7.13.10", - "resolved": "https://registry.npmjs.org/@babel/runtime-corejs2/-/runtime-corejs2-7.13.10.tgz", - "integrity": "sha512-rZw5P1ZewO6XZTDxtXuAuAFUqfNXyM8HO/9WiaDd34Anka0uFTpo0RvBLeV775AEE/zKw3LQB+poZw/O9lrZBg==", - "requires": { - "core-js": "^2.6.5", - "regenerator-runtime": "^0.13.4" - } - }, - "@mozilla/readability": { - "version": "0.4.1", - "resolved": "https://registry.npmjs.org/@mozilla/readability/-/readability-0.4.1.tgz", - "integrity": "sha512-yar/f0w0fRUVM895s6yd5Z2oIxjG/6c3ROB/uQboSOBaDlri/nqI4aKtdqrldWciTLcdpjB2Z6MiVF2Bl9b8LA==" - }, - "@postlight/ci-failed-test-reporter": { - "version": "1.0.26", - "resolved": "https://registry.npmjs.org/@postlight/ci-failed-test-reporter/-/ci-failed-test-reporter-1.0.26.tgz", - "integrity": "sha512-xfXzxyOiKhco7Gx2OLTe9b66b0dFJw0elg94KGHoQXf5F8JqqFvdo35J8wayGOor64CSMvn+4Bjlu2NKV+yTGA==", - "requires": { - "dotenv": "^6.2.0", - "node-fetch": "^2.3.0" - } - }, - "@postlight/mercury-parser": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/@postlight/mercury-parser/-/mercury-parser-2.2.0.tgz", - "integrity": "sha512-nz6dIvCAaiv74o1vhhp0BRsUe+ysPbZG5mVNpJmgLoI/goOBqRMM3Yg8uXtnv++e7tzKFSXdls8b2/zKk1qL0Q==", - "requires": { - "@babel/runtime-corejs2": "^7.2.0", - "@postlight/ci-failed-test-reporter": "^1.0", - "browser-request": "github:postlight/browser-request#feat-add-headers-to-response", - "cheerio": "^0.22.0", - "difflib": "github:postlight/difflib.js", - "ellipsize": "0.1.0", - "iconv-lite": "0.5.0", - "jquery": "^3.4.1", - "moment": "^2.23.0", - "moment-parseformat": "3.0.0", - "moment-timezone": "0.5.26", - "postman-request": 
"^2.88.1-postman.7.1", - "request-promise": "^4.2.2", - "string-direction": "^0.1.2", - "turndown": "^5.0.3", - "url": "^0.11.0", - "valid-url": "^1.0.9", - "wuzzy": "^0.1.4", - "yargs-parser": "^13.0.0" - }, - "dependencies": { - "http-headers": { - "version": "3.0.2", - "bundled": true, - "requires": { - "next-line": "^1.1.0" - } - }, - "jquery": { - "version": "3.4.1", - "bundled": true - }, - "moment": { - "version": "2.23.0", - "bundled": true - }, - "moment-timezone": { - "version": "0.5.26", - "bundled": true, - "requires": { - "moment": ">= 2.9.0" - } - }, - "next-line": { - "version": "1.1.0", - "bundled": true - } - } - }, - "@postman/form-data": { - "version": "3.1.1", - "resolved": "https://registry.npmjs.org/@postman/form-data/-/form-data-3.1.1.tgz", - "integrity": "sha512-vjh8Q2a8S6UCm/KKs31XFJqEEgmbjBmpPNVV2eVav6905wyFAwaUOBGA1NPBI4ERH9MMZc6w0umFgM6WbEPMdg==", - "requires": { - "asynckit": "^0.4.0", - "combined-stream": "^1.0.8", - "mime-types": "^2.1.12" - } - }, - "@postman/tunnel-agent": { - "version": "0.6.3", - "resolved": "https://registry.npmjs.org/@postman/tunnel-agent/-/tunnel-agent-0.6.3.tgz", - "integrity": "sha512-k57fzmAZ2PJGxfOA4SGR05ejorHbVAa/84Hxh/2nAztjNXc4ZjOm9NUIk6/Z6LCrBvJZqjRZbN8e/nROVUPVdg==", - "requires": { - "safe-buffer": "^5.0.1" - } - }, - "@types/node": { - "version": "14.14.37", - "resolved": "https://registry.npmjs.org/@types/node/-/node-14.14.37.tgz", - "integrity": "sha512-XYmBiy+ohOR4Lh5jE379fV2IU+6Jn4g5qASinhitfyO71b/sCo6MKsMLF5tc7Zf2CE8hViVQyYSobJNke8OvUw==", - "optional": true - }, - "@types/yauzl": { - "version": "2.9.1", - "resolved": "https://registry.npmjs.org/@types/yauzl/-/yauzl-2.9.1.tgz", - "integrity": "sha512-A1b8SU4D10uoPjwb0lnHmmu8wZhR9d+9o2PKBQT2jU5YPTKsxac6M2qGAdY7VcL+dHHhARVUDmeg0rOrcd9EjA==", - "optional": true, - "requires": { - "@types/node": "*" - } - }, - "abab": { - "version": "2.0.5", - "resolved": "https://registry.npmjs.org/abab/-/abab-2.0.5.tgz", - "integrity": 
"sha512-9IK9EadsbHo6jLWIpxpR6pL0sazTXV6+SQv25ZB+F7Bj9mJNaOc4nCRabwd5M/JwmUa8idz6Eci6eKfJryPs6Q==" - }, - "acorn": { - "version": "5.7.4", - "resolved": "https://registry.npmjs.org/acorn/-/acorn-5.7.4.tgz", - "integrity": "sha512-1D++VG7BhrtvQpNbBzovKNc1FLGGEE/oGe7b9xJm/RFHMBeUaUGpluV9RLjZa47YFdPcDAenEYuq9pQPcMdLJg==" - }, - "acorn-globals": { - "version": "4.3.4", - "resolved": "https://registry.npmjs.org/acorn-globals/-/acorn-globals-4.3.4.tgz", - "integrity": "sha512-clfQEh21R+D0leSbUdWf3OcfqyaCSAQ8Ryq00bofSekfr9W8u1jyYZo6ir0xu9Gtcf7BjcHJpnbZH7JOCpP60A==", - "requires": { - "acorn": "^6.0.1", - "acorn-walk": "^6.0.1" - }, - "dependencies": { - "acorn": { - "version": "6.4.2", - "resolved": "https://registry.npmjs.org/acorn/-/acorn-6.4.2.tgz", - "integrity": "sha512-XtGIhXwF8YM8bJhGxG5kXgjkEuNGLTkoYqVE+KMR+aspr4KGYmKYg7yUe3KghyQ9yheNwLnjmzh/7+gfDBmHCQ==" - } - } - }, - "acorn-walk": { - "version": "6.2.0", - "resolved": "https://registry.npmjs.org/acorn-walk/-/acorn-walk-6.2.0.tgz", - "integrity": "sha512-7evsyfH1cLOCdAzZAd43Cic04yKydNx0cF+7tiA19p1XnLLPU4dpCQOqpjqwokFe//vS0QqfqqjCS2JkiIs0cA==" - }, - "agent-base": { - "version": "5.1.1", - "resolved": "https://registry.npmjs.org/agent-base/-/agent-base-5.1.1.tgz", - "integrity": "sha512-TMeqbNl2fMW0nMjTEPOwe3J/PRFP4vqeoNuQMG0HlMrtm5QxKqdvAkZ1pRBQ/ulIyDD5Yq0nJ7YbdD8ey0TO3g==" - }, - "ajv": { - "version": "6.12.6", - "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.12.6.tgz", - "integrity": "sha512-j3fVLgvTo527anyYyJOGTYJbG+vnnQYvE0m5mmkc1TK+nxAppkCLMIL0aZ4dblVCNoGShhm+kzE4ZUykBoMg4g==", - "requires": { - "fast-deep-equal": "^3.1.1", - "fast-json-stable-stringify": "^2.0.0", - "json-schema-traverse": "^0.4.1", - "uri-js": "^4.2.2" - } - }, - "ansi-regex": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.0.tgz", - "integrity": "sha512-bY6fj56OUQ0hU1KjFNDQuJFezqKdrAyFdIevADiqrWHwSlbmBNMHp5ak2f40Pm8JTFyM2mqxkG6ngkHO11f/lg==" - }, - "ansi-styles": { - "version": "4.3.0", - 
"resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", - "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", - "requires": { - "color-convert": "^2.0.1" - } - }, - "array-equal": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/array-equal/-/array-equal-1.0.0.tgz", - "integrity": "sha1-jCpe8kcv2ep0KwTHenUJO6J1fJM=" - }, - "asn1": { - "version": "0.2.4", - "resolved": "https://registry.npmjs.org/asn1/-/asn1-0.2.4.tgz", - "integrity": "sha512-jxwzQpLQjSmWXgwaCZE9Nz+glAG01yF1QnWgbhGwHI5A6FRIEY6IVqtHhIepHqI7/kyEyQEagBC5mBEFlIYvdg==", - "requires": { - "safer-buffer": "~2.1.0" - } - }, - "assert-plus": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/assert-plus/-/assert-plus-1.0.0.tgz", - "integrity": "sha1-8S4PPF13sLHN2RRpQuTpbB5N1SU=" - }, - "async-limiter": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/async-limiter/-/async-limiter-1.0.1.tgz", - "integrity": "sha512-csOlWGAcRFJaI6m+F2WKdnMKr4HhdhFVBk0H/QbJFMCr+uO2kwohwXQPxw/9OCxp05r5ghVBFSyioixx3gfkNQ==" - }, - "asynckit": { - "version": "0.4.0", - "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz", - "integrity": "sha1-x57Zf380y48robyXkLzDZkdLS3k=" - }, - "aws-sign2": { - "version": "0.7.0", - "resolved": "https://registry.npmjs.org/aws-sign2/-/aws-sign2-0.7.0.tgz", - "integrity": "sha1-tG6JCTSpWR8tL2+G1+ap8bP+dqg=" - }, - "aws4": { - "version": "1.11.0", - "resolved": "https://registry.npmjs.org/aws4/-/aws4-1.11.0.tgz", - "integrity": "sha512-xh1Rl34h6Fi1DC2WWKfxUTVqRsNnr6LsKz2+hfwDxQJWmrx8+c7ylaqBMcHfl1U1r2dsifOvKX3LQuLNZ+XSvA==" - }, - "balanced-match": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.0.tgz", - "integrity": "sha1-ibTRmasr7kneFk6gK4nORi1xt2c=" - }, - "base64-js": { - "version": "1.5.1", - "resolved": "https://registry.npmjs.org/base64-js/-/base64-js-1.5.1.tgz", - "integrity": 
"sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==" - }, - "bcrypt-pbkdf": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/bcrypt-pbkdf/-/bcrypt-pbkdf-1.0.2.tgz", - "integrity": "sha1-pDAdOJtqQ/m2f/PKEaP2Y342Dp4=", - "requires": { - "tweetnacl": "^0.14.3" - } - }, - "bl": { - "version": "4.1.0", - "resolved": "https://registry.npmjs.org/bl/-/bl-4.1.0.tgz", - "integrity": "sha512-1W07cM9gS6DcLperZfFSj+bWLtaPGSOHWhPiGzXmvVJbRLdG82sH/Kn8EtW1VqWVA54AKf2h5k5BbnIbwF3h6w==", - "requires": { - "buffer": "^5.5.0", - "inherits": "^2.0.4", - "readable-stream": "^3.4.0" - } - }, - "bluebird": { - "version": "2.11.0", - "resolved": "https://registry.npmjs.org/bluebird/-/bluebird-2.11.0.tgz", - "integrity": "sha1-U0uQM8AiyVecVro7Plpcqvu2UOE=" - }, - "boolbase": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz", - "integrity": "sha1-aN/1++YMUes3cl6p4+0xDcwed24=" - }, - "brace-expansion": { - "version": "1.1.11", - "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz", - "integrity": "sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA==", - "requires": { - "balanced-match": "^1.0.0", - "concat-map": "0.0.1" - } - }, - "brotli": { - "version": "1.3.2", - "resolved": "https://registry.npmjs.org/brotli/-/brotli-1.3.2.tgz", - "integrity": "sha1-UlqcrU/LqWR119OI9q7LE+7VL0Y=", - "requires": { - "base64-js": "^1.1.2" - } - }, - "browser-process-hrtime": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/browser-process-hrtime/-/browser-process-hrtime-1.0.0.tgz", - "integrity": "sha512-9o5UecI3GhkpM6DrXr69PblIuWxPKk9Y0jHBRhdocZ2y7YECBFCsHm79Pr3OyR2AvjhDkabFJaDJMYRazHgsow==" - }, - "browser-request": { - "version": "github:postlight/browser-request#38faa5b85741aabfca61aa37d1ef044d68969ddf", - "from": "github:postlight/browser-request#feat-add-headers-to-response", - "requires": { - 
"http-headers": "^3.0.1" - } - }, - "buffer": { - "version": "5.7.1", - "resolved": "https://registry.npmjs.org/buffer/-/buffer-5.7.1.tgz", - "integrity": "sha512-EHcyIPBQ4BSGlvjB16k5KgAJ27CIsHY/2JBmCRReo48y9rQ3MaUzWX3KVlBa4U7MyX02HdVj0K7C3WaB3ju7FQ==", - "requires": { - "base64-js": "^1.3.1", - "ieee754": "^1.1.13" - } - }, - "buffer-crc32": { - "version": "0.2.13", - "resolved": "https://registry.npmjs.org/buffer-crc32/-/buffer-crc32-0.2.13.tgz", - "integrity": "sha1-DTM+PwDqxQqhRUq9MO+MKl2ackI=" - }, - "camelcase": { - "version": "5.3.1", - "resolved": "https://registry.npmjs.org/camelcase/-/camelcase-5.3.1.tgz", - "integrity": "sha512-L28STB170nwWS63UjtlEOE3dldQApaJXZkOI1uMFfzf3rRuPegHaHesyee+YxQ+W6SvRDQV6UrdOdRiR153wJg==" - }, - "caseless": { - "version": "0.12.0", - "resolved": "https://registry.npmjs.org/caseless/-/caseless-0.12.0.tgz", - "integrity": "sha1-G2gcIf+EAzyCZUMJBolCDRhxUdw=" - }, - "cheerio": { - "version": "0.22.0", - "resolved": "https://registry.npmjs.org/cheerio/-/cheerio-0.22.0.tgz", - "integrity": "sha1-qbqoYKP5tZWmuBsahocxIe06Jp4=", - "requires": { - "css-select": "~1.2.0", - "dom-serializer": "~0.1.0", - "entities": "~1.1.1", - "htmlparser2": "^3.9.1", - "lodash.assignin": "^4.0.9", - "lodash.bind": "^4.1.4", - "lodash.defaults": "^4.0.1", - "lodash.filter": "^4.4.0", - "lodash.flatten": "^4.2.0", - "lodash.foreach": "^4.3.0", - "lodash.map": "^4.4.0", - "lodash.merge": "^4.4.0", - "lodash.pick": "^4.2.1", - "lodash.reduce": "^4.4.0", - "lodash.reject": "^4.4.0", - "lodash.some": "^4.4.0" - } - }, - "chownr": { - "version": "1.1.4", - "resolved": "https://registry.npmjs.org/chownr/-/chownr-1.1.4.tgz", - "integrity": "sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg==" - }, - "cliui": { - "version": "7.0.4", - "resolved": "https://registry.npmjs.org/cliui/-/cliui-7.0.4.tgz", - "integrity": "sha512-OcRE68cOsVMXp1Yvonl/fzkQOyjLSu/8bhPDfQt0e0/Eb283TKP20Fs2MqoPsr9SwA595rRCA+QMzYc9nBP+JQ==", - 
"requires": { - "string-width": "^4.2.0", - "strip-ansi": "^6.0.0", - "wrap-ansi": "^7.0.0" - } - }, - "color-convert": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz", - "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==", - "requires": { - "color-name": "~1.1.4" - } - }, - "color-name": { - "version": "1.1.4", - "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz", - "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==" - }, - "combined-stream": { - "version": "1.0.8", - "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz", - "integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==", - "requires": { - "delayed-stream": "~1.0.0" - } - }, - "concat-map": { - "version": "0.0.1", - "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz", - "integrity": "sha1-2Klr13/Wjfd5OnMDajug1UBdR3s=" - }, - "core-js": { - "version": "2.6.12", - "resolved": "https://registry.npmjs.org/core-js/-/core-js-2.6.12.tgz", - "integrity": "sha512-Kb2wC0fvsWfQrgk8HU5lW6U/Lcs8+9aaYcy4ZFc6DDlo4nZ7n70dEgE5rtR0oG6ufKDUnrwfWL1mXR5ljDatrQ==" - }, - "core-util-is": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.2.tgz", - "integrity": "sha1-tf1UIgqivFq1eqtxQMlAdUUDwac=" - }, - "css-select": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/css-select/-/css-select-1.2.0.tgz", - "integrity": "sha1-KzoRBTnFNV8c2NMUYj6HCxIeyFg=", - "requires": { - "boolbase": "~1.0.0", - "css-what": "2.1", - "domutils": "1.5.1", - "nth-check": "~1.0.1" - } - }, - "css-what": { - "version": "2.1.3", - "resolved": "https://registry.npmjs.org/css-what/-/css-what-2.1.3.tgz", - "integrity": 
"sha512-a+EPoD+uZiNfh+5fxw2nO9QwFa6nJe2Or35fGY6Ipw1R3R4AGz1d1TEZrCegvw2YTmZ0jXirGYlzxxpYSHwpEg==" - }, - "cssom": { - "version": "0.3.8", - "resolved": "https://registry.npmjs.org/cssom/-/cssom-0.3.8.tgz", - "integrity": "sha512-b0tGHbfegbhPJpxpiBPU2sCkigAqtM9O121le6bbOlgyV+NyGyCmVfJ6QW9eRjz8CpNfWEOYBIMIGRYkLwsIYg==" - }, - "cssstyle": { - "version": "1.4.0", - "resolved": "https://registry.npmjs.org/cssstyle/-/cssstyle-1.4.0.tgz", - "integrity": "sha512-GBrLZYZ4X4x6/QEoBnIrqb8B/f5l4+8me2dkom/j1Gtbxy0kBv6OGzKuAsGM75bkGwGAFkt56Iwg28S3XTZgSA==", - "requires": { - "cssom": "0.3.x" - } - }, - "dashdash": { - "version": "1.14.1", - "resolved": "https://registry.npmjs.org/dashdash/-/dashdash-1.14.1.tgz", - "integrity": "sha1-hTz6D3y+L+1d4gMmuN1YEDX24vA=", - "requires": { - "assert-plus": "^1.0.0" - } - }, - "data-urls": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/data-urls/-/data-urls-1.1.0.tgz", - "integrity": "sha512-YTWYI9se1P55u58gL5GkQHW4P6VJBJ5iBT+B5a7i2Tjadhv52paJG0qHX4A0OR6/t52odI64KP2YvFpkDOi3eQ==", - "requires": { - "abab": "^2.0.0", - "whatwg-mimetype": "^2.2.0", - "whatwg-url": "^7.0.0" - }, - "dependencies": { - "whatwg-url": { - "version": "7.1.0", - "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-7.1.0.tgz", - "integrity": "sha512-WUu7Rg1DroM7oQvGWfOiAK21n74Gg+T4elXEQYkOhtyLeWiJFoOGLXPKI/9gzIie9CtwVLm8wtw6YJdKyxSjeg==", - "requires": { - "lodash.sortby": "^4.7.0", - "tr46": "^1.0.1", - "webidl-conversions": "^4.0.2" - } - } - } - }, - "debug": { - "version": "4.3.1", - "resolved": "https://registry.npmjs.org/debug/-/debug-4.3.1.tgz", - "integrity": "sha512-doEwdvm4PCeK4K3RQN2ZC2BYUBaxwLARCqZmMjtF8a51J2Rb0xpVloFRnCODwqjpwnAoao4pelN8l3RJdv3gRQ==", - "requires": { - "ms": "2.1.2" - } - }, - "decamelize": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/decamelize/-/decamelize-1.2.0.tgz", - "integrity": "sha1-9lNNFRSCabIDUue+4m9QH5oZEpA=" - }, - "decimal.js": { - "version": "10.2.1", - "resolved": 
"https://registry.npmjs.org/decimal.js/-/decimal.js-10.2.1.tgz", - "integrity": "sha512-KaL7+6Fw6i5A2XSnsbhm/6B+NuEA7TZ4vqxnd5tXz9sbKtrN9Srj8ab4vKVdK8YAqZO9P1kg45Y6YLoduPf+kw==" - }, - "deep-is": { - "version": "0.1.3", - "resolved": "https://registry.npmjs.org/deep-is/-/deep-is-0.1.3.tgz", - "integrity": "sha1-s2nW+128E+7PUk+RsHD+7cNXzzQ=" - }, - "delayed-stream": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz", - "integrity": "sha1-3zrhmayt+31ECqrgsp4icrJOxhk=" - }, - "devtools-protocol": { - "version": "0.0.818844", - "resolved": "https://registry.npmjs.org/devtools-protocol/-/devtools-protocol-0.0.818844.tgz", - "integrity": "sha512-AD1hi7iVJ8OD0aMLQU5VK0XH9LDlA1+BcPIgrAxPfaibx2DbWucuyOhc4oyQCbnvDDO68nN6/LcKfqTP343Jjg==" - }, - "difflib": { - "version": "github:postlight/difflib.js#32e8e38c7fcd935241b9baab71bb432fd9b166ed", - "from": "github:postlight/difflib.js", - "requires": { - "heap": ">= 0.2.0" - } - }, - "dom-serializer": { - "version": "0.1.1", - "resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-0.1.1.tgz", - "integrity": "sha512-l0IU0pPzLWSHBcieZbpOKgkIn3ts3vAh7ZuFyXNwJxJXk/c4Gwj9xaTJwIDVQCXawWD0qb3IzMGH5rglQaO0XA==", - "requires": { - "domelementtype": "^1.3.0", - "entities": "^1.1.1" - } - }, - "domelementtype": { - "version": "1.3.1", - "resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-1.3.1.tgz", - "integrity": "sha512-BSKB+TSpMpFI/HOxCNr1O8aMOTZ8hT3pM3GQ0w/mWRmkhEDSFJkkyzz4XQsBV44BChwGkrDfMyjVD0eA2aFV3w==" - }, - "domexception": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/domexception/-/domexception-1.0.1.tgz", - "integrity": "sha512-raigMkn7CJNNo6Ihro1fzG7wr3fHuYVytzquZKX5n0yizGsTcYgzdIUwj1X9pK0VvjeihV+XiclP+DjwbsSKug==", - "requires": { - "webidl-conversions": "^4.0.2" - } - }, - "domhandler": { - "version": "2.4.2", - "resolved": "https://registry.npmjs.org/domhandler/-/domhandler-2.4.2.tgz", - "integrity": 
"sha512-JiK04h0Ht5u/80fdLMCEmV4zkNh2BcoMFBmZ/91WtYZ8qVXSKjiw7fXMgFPnHcSZgOo3XdinHvmnDUeMf5R4wA==", - "requires": { - "domelementtype": "1" - } - }, - "dompurify": { - "version": "2.2.7", - "resolved": "https://registry.npmjs.org/dompurify/-/dompurify-2.2.7.tgz", - "integrity": "sha512-jdtDffdGNY+C76jvodNTu9jt5yYj59vuTUyx+wXdzcSwAGTYZDAQkQ7Iwx9zcGrA4ixC1syU4H3RZROqRxokxg==" - }, - "domutils": { - "version": "1.5.1", - "resolved": "https://registry.npmjs.org/domutils/-/domutils-1.5.1.tgz", - "integrity": "sha1-3NhIiib1Y9YQeeSMn3t+Mjc2gs8=", - "requires": { - "dom-serializer": "0", - "domelementtype": "1" - } - }, - "dotenv": { - "version": "6.2.0", - "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-6.2.0.tgz", - "integrity": "sha512-HygQCKUBSFl8wKQZBSemMywRWcEDNidvNbjGVyZu3nbZ8qq9ubiPoGLMdRDpfSrpkkm9BXYFkpKxxFX38o/76w==" - }, - "ecc-jsbn": { - "version": "0.1.2", - "resolved": "https://registry.npmjs.org/ecc-jsbn/-/ecc-jsbn-0.1.2.tgz", - "integrity": "sha1-OoOpBOVDUyh4dMVkt1SThoSamMk=", - "requires": { - "jsbn": "~0.1.0", - "safer-buffer": "^2.1.0" - } - }, - "ellipsize": { - "version": "0.1.0", - "resolved": "https://registry.npmjs.org/ellipsize/-/ellipsize-0.1.0.tgz", - "integrity": "sha1-nUNoLUS5GtFuvYQmisEDFwplU/g=" - }, - "emoji-regex": { - "version": "8.0.0", - "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz", - "integrity": "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==" - }, - "end-of-stream": { - "version": "1.4.4", - "resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.4.tgz", - "integrity": "sha512-+uw1inIHVPQoaVuHzRyXd21icM+cnt4CzD5rW+NC1wjOUSTOs+Te7FOv7AhN7vS9x/oIyhLP5PR1H+phQAHu5Q==", - "requires": { - "once": "^1.4.0" - } - }, - "entities": { - "version": "1.1.2", - "resolved": "https://registry.npmjs.org/entities/-/entities-1.1.2.tgz", - "integrity": "sha512-f2LZMYl1Fzu7YSBKg+RoROelpOaNrcGmE9AZubeDfrCEia483oW4MI4VyFd5VNHIgQ/7qm1I0wUHK1eJnn2y2w==" 
- }, - "escalade": { - "version": "3.1.1", - "resolved": "https://registry.npmjs.org/escalade/-/escalade-3.1.1.tgz", - "integrity": "sha512-k0er2gUkLf8O0zKJiAhmkTnJlTvINGv7ygDNPbeIsX/TJjGJZHuh9B2UxbsaEkmlEo9MfhrSzmhIlhRlI2GXnw==" - }, - "escodegen": { - "version": "1.14.3", - "resolved": "https://registry.npmjs.org/escodegen/-/escodegen-1.14.3.tgz", - "integrity": "sha512-qFcX0XJkdg+PB3xjZZG/wKSuT1PnQWx57+TVSjIMmILd2yC/6ByYElPwJnslDsuWuSAp4AwJGumarAAmJch5Kw==", - "requires": { - "esprima": "^4.0.1", - "estraverse": "^4.2.0", - "esutils": "^2.0.2", - "optionator": "^0.8.1", - "source-map": "~0.6.1" - } - }, - "esprima": { - "version": "4.0.1", - "resolved": "https://registry.npmjs.org/esprima/-/esprima-4.0.1.tgz", - "integrity": "sha512-eGuFFw7Upda+g4p+QHvnW0RyTX/SVeJBDM/gCtMARO0cLuT2HcEKnTPvhjV6aGeqrCB/sbNop0Kszm0jsaWU4A==" - }, - "estraverse": { - "version": "4.3.0", - "resolved": "https://registry.npmjs.org/estraverse/-/estraverse-4.3.0.tgz", - "integrity": "sha512-39nnKffWz8xN1BU/2c79n9nB9HDzo0niYUqx6xyqUnyoAnQyyWpOTdZEeiCch8BBu515t4wp9ZmgVfVhn9EBpw==" - }, - "esutils": { - "version": "2.0.3", - "resolved": "https://registry.npmjs.org/esutils/-/esutils-2.0.3.tgz", - "integrity": "sha512-kVscqXk4OCp68SZ0dkgEKVi6/8ij300KBWTJq32P/dYeWTSwK41WyTxalN1eRmA5Z9UU/LX9D7FWSmV9SAYx6g==" - }, - "extend": { - "version": "3.0.2", - "resolved": "https://registry.npmjs.org/extend/-/extend-3.0.2.tgz", - "integrity": "sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g==" - }, - "extract-zip": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/extract-zip/-/extract-zip-2.0.1.tgz", - "integrity": "sha512-GDhU9ntwuKyGXdZBUgTIe+vXnWj0fppUEtMDL0+idd5Sta8TGpHssn/eusA9mrPr9qNDym6SxAYZjNvCn/9RBg==", - "requires": { - "@types/yauzl": "^2.9.1", - "debug": "^4.1.1", - "get-stream": "^5.1.0", - "yauzl": "^2.10.0" - } - }, - "extsprintf": { - "version": "1.3.0", - "resolved": 
"https://registry.npmjs.org/extsprintf/-/extsprintf-1.3.0.tgz", - "integrity": "sha1-lpGEQOMEGnpBT4xS48V06zw+HgU=" - }, - "fast-deep-equal": { - "version": "3.1.3", - "resolved": "https://registry.npmjs.org/fast-deep-equal/-/fast-deep-equal-3.1.3.tgz", - "integrity": "sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q==" - }, - "fast-json-stable-stringify": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/fast-json-stable-stringify/-/fast-json-stable-stringify-2.1.0.tgz", - "integrity": "sha512-lhd/wF+Lk98HZoTCtlVraHtfh5XYijIjalXck7saUtuanSDyLMxnHhSXEDJqHxD7msR8D0uCmqlkwjCV8xvwHw==" - }, - "fast-levenshtein": { - "version": "2.0.6", - "resolved": "https://registry.npmjs.org/fast-levenshtein/-/fast-levenshtein-2.0.6.tgz", - "integrity": "sha1-PYpcZog6FqMMqGQ+hR8Zuqd5eRc=" - }, - "fd-slicer": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/fd-slicer/-/fd-slicer-1.1.0.tgz", - "integrity": "sha1-JcfInLH5B3+IkbvmHY85Dq4lbx4=", - "requires": { - "pend": "~1.2.0" - } - }, - "file-url": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/file-url/-/file-url-3.0.0.tgz", - "integrity": "sha512-g872QGsHexznxkIAdK8UiZRe7SkE6kvylShU4Nsj8NvfvZag7S0QuQ4IgvPDkk75HxgjIVDwycFTDAgIiO4nDA==" - }, - "find-up": { - "version": "4.1.0", - "resolved": "https://registry.npmjs.org/find-up/-/find-up-4.1.0.tgz", - "integrity": "sha512-PpOwAdQ/YlXQ2vj8a3h8IipDuYRi3wceVQQGYWxNINccq40Anw7BlsEXCMbt1Zt+OLA6Fq9suIpIWD0OsnISlw==", - "requires": { - "locate-path": "^5.0.0", - "path-exists": "^4.0.0" - } - }, - "forever-agent": { - "version": "0.6.1", - "resolved": "https://registry.npmjs.org/forever-agent/-/forever-agent-0.6.1.tgz", - "integrity": "sha1-+8cfDEGt6zf5bFd60e1C2P2sypE=" - }, - "form-data": { - "version": "2.3.3", - "resolved": "https://registry.npmjs.org/form-data/-/form-data-2.3.3.tgz", - "integrity": 
"sha512-1lLKB2Mu3aGP1Q/2eCOx0fNbRMe7XdwktwOruhfqqd0rIJWwN4Dh+E3hrPSlDCXnSR7UtZ1N38rVXm+6+MEhJQ==", - "requires": { - "asynckit": "^0.4.0", - "combined-stream": "^1.0.6", - "mime-types": "^2.1.12" - } - }, - "fs-constants": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/fs-constants/-/fs-constants-1.0.0.tgz", - "integrity": "sha512-y6OAwoSIf7FyjMIv94u+b5rdheZEjzR63GTyZJm5qh4Bi+2YgwLCcI/fPFZkL5PSixOt6ZNKm+w+Hfp/Bciwow==" - }, - "fs.realpath": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz", - "integrity": "sha1-FQStJSMVjKpA20onh8sBQRmU6k8=" - }, - "get-caller-file": { - "version": "2.0.5", - "resolved": "https://registry.npmjs.org/get-caller-file/-/get-caller-file-2.0.5.tgz", - "integrity": "sha512-DyFP3BM/3YHTQOCUL/w0OZHR0lpKeGrxotcHWcqNEdnltqFwXVfhEBQ94eIo34AfQpo0rGki4cyIiftY06h2Fg==" - }, - "get-stream": { - "version": "5.2.0", - "resolved": "https://registry.npmjs.org/get-stream/-/get-stream-5.2.0.tgz", - "integrity": "sha512-nBF+F1rAZVCu/p7rjzgA+Yb4lfYXrpl7a6VmJrU8wF9I1CKvP/QwPNZHnOlwbTkY6dvtFIzFMSyQXbLoTQPRpA==", - "requires": { - "pump": "^3.0.0" - } - }, - "getpass": { - "version": "0.1.7", - "resolved": "https://registry.npmjs.org/getpass/-/getpass-0.1.7.tgz", - "integrity": "sha1-Xv+OPmhNVprkyysSgmBOi6YhSfo=", - "requires": { - "assert-plus": "^1.0.0" - } - }, - "glob": { - "version": "7.1.6", - "resolved": "https://registry.npmjs.org/glob/-/glob-7.1.6.tgz", - "integrity": "sha512-LwaxwyZ72Lk7vZINtNNrywX0ZuLyStrdDtabefZKAY5ZGJhVtgdznluResxNmPitE0SAO+O26sWTHeKSI2wMBA==", - "requires": { - "fs.realpath": "^1.0.0", - "inflight": "^1.0.4", - "inherits": "2", - "minimatch": "^3.0.4", - "once": "^1.3.0", - "path-is-absolute": "^1.0.0" - } - }, - "har-schema": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/har-schema/-/har-schema-2.0.0.tgz", - "integrity": "sha1-qUwiJOvKwEeCoNkDVSHyRzW37JI=" - }, - "har-validator": { - "version": "5.1.5", - "resolved": 
"https://registry.npmjs.org/har-validator/-/har-validator-5.1.5.tgz", - "integrity": "sha512-nmT2T0lljbxdQZfspsno9hgrG3Uir6Ks5afism62poxqBM6sDnMEuPmzTq8XN0OEwqKLLdh1jQI3qyE66Nzb3w==", - "requires": { - "ajv": "^6.12.3", - "har-schema": "^2.0.0" - } - }, - "heap": { - "version": "0.2.6", - "resolved": "https://registry.npmjs.org/heap/-/heap-0.2.6.tgz", - "integrity": "sha1-CH4fELBGky/IWU3Z5tN4r8nR5aw=" - }, - "html-encoding-sniffer": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/html-encoding-sniffer/-/html-encoding-sniffer-1.0.2.tgz", - "integrity": "sha512-71lZziiDnsuabfdYiUeWdCVyKuqwWi23L8YeIgV9jSSZHCtb6wB1BKWooH7L3tn4/FuZJMVWyNaIDr4RGmaSYw==", - "requires": { - "whatwg-encoding": "^1.0.1" - } - }, - "htmlparser2": { - "version": "3.10.1", - "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-3.10.1.tgz", - "integrity": "sha512-IgieNijUMbkDovyoKObU1DUhm1iwNYE/fuifEoEHfd1oZKZDaONBSkal7Y01shxsM49R4XaMdGez3WnF9UfiCQ==", - "requires": { - "domelementtype": "^1.3.1", - "domhandler": "^2.3.0", - "domutils": "^1.5.1", - "entities": "^1.1.1", - "inherits": "^2.0.1", - "readable-stream": "^3.1.1" - } - }, - "http-headers": { - "version": "3.0.2", - "resolved": "https://registry.npmjs.org/http-headers/-/http-headers-3.0.2.tgz", - "integrity": "sha512-87E1I+2Wg4dxxz4rcxElo3dxO/w1ZtgL1yA0Sb6vH3qU16vRKq1NjWQv9SCY3ly2OQROcoxHZOUpmelS+k6wOw==", - "requires": { - "next-line": "^1.1.0" - } - }, - "http-signature": { - "version": "1.3.5", - "resolved": "https://registry.npmjs.org/http-signature/-/http-signature-1.3.5.tgz", - "integrity": "sha512-NwoTQYSJoFt34jSBbwzDHDofoA61NGXzu6wXh95o1Ry62EnmKjXb/nR/RknLeZ3G/uGwrlKNY2z7uPt+Cdl7Tw==", - "requires": { - "assert-plus": "^1.0.0", - "jsprim": "^1.2.2", - "sshpk": "^1.14.1" - } - }, - "https-proxy-agent": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-4.0.0.tgz", - "integrity": 
"sha512-zoDhWrkR3of1l9QAL8/scJZyLu8j/gBkcwcaQOZh7Gyh/+uJQzGVETdgT30akuwkpL8HTRfssqI3BZuV18teDg==", - "requires": { - "agent-base": "5", - "debug": "4" - } - }, - "iconv-lite": { - "version": "0.5.0", - "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.5.0.tgz", - "integrity": "sha512-NnEhI9hIEKHOzJ4f697DMz9IQEXr/MMJ5w64vN2/4Ai+wRnvV7SBrL0KLoRlwaKVghOc7LQ5YkPLuX146b6Ydw==", - "requires": { - "safer-buffer": ">= 2.1.2 < 3" - } - }, - "ieee754": { - "version": "1.2.1", - "resolved": "https://registry.npmjs.org/ieee754/-/ieee754-1.2.1.tgz", - "integrity": "sha512-dcyqhDvX1C46lXZcVqCpK+FtMRQVdIMN6/Df5js2zouUsqG7I6sFxitIC+7KYK29KdXOLHdu9zL4sFnoVQnqaA==" - }, - "immediate": { - "version": "3.0.6", - "resolved": "https://registry.npmjs.org/immediate/-/immediate-3.0.6.tgz", - "integrity": "sha1-nbHb0Pr43m++D13V5Wu2BigN5ps=" - }, - "inflight": { - "version": "1.0.6", - "resolved": "https://registry.npmjs.org/inflight/-/inflight-1.0.6.tgz", - "integrity": "sha1-Sb1jMdfQLQwJvJEKEHW6gWW1bfk=", - "requires": { - "once": "^1.3.0", - "wrappy": "1" - } - }, - "inherits": { - "version": "2.0.4", - "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz", - "integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==" - }, - "is-fullwidth-code-point": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-3.0.0.tgz", - "integrity": "sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg==" - }, - "is-potential-custom-element-name": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-potential-custom-element-name/-/is-potential-custom-element-name-1.0.0.tgz", - "integrity": "sha1-DFLlS8yjkbssSUsh6GJtczbG45c=" - }, - "is-typedarray": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-typedarray/-/is-typedarray-1.0.0.tgz", - "integrity": "sha1-5HnICFjfDBsR3dppQPlgEfzaSpo=" - 
}, - "isarray": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz", - "integrity": "sha1-u5NdSFgsuhaMBoNJV6VKPgcSTxE=" - }, - "isstream": { - "version": "0.1.2", - "resolved": "https://registry.npmjs.org/isstream/-/isstream-0.1.2.tgz", - "integrity": "sha1-R+Y/evVa+m+S4VAOaQ64uFKcCZo=" - }, - "jsbn": { - "version": "0.1.1", - "resolved": "https://registry.npmjs.org/jsbn/-/jsbn-0.1.1.tgz", - "integrity": "sha1-peZUwuWi3rXyAdls77yoDA7y9RM=" - }, - "jsdom": { - "version": "11.12.0", - "resolved": "https://registry.npmjs.org/jsdom/-/jsdom-11.12.0.tgz", - "integrity": "sha512-y8Px43oyiBM13Zc1z780FrfNLJCXTL40EWlty/LXUtcjykRBNgLlCjWXpfSPBl2iv+N7koQN+dvqszHZgT/Fjw==", - "requires": { - "abab": "^2.0.0", - "acorn": "^5.5.3", - "acorn-globals": "^4.1.0", - "array-equal": "^1.0.0", - "cssom": ">= 0.3.2 < 0.4.0", - "cssstyle": "^1.0.0", - "data-urls": "^1.0.0", - "domexception": "^1.0.1", - "escodegen": "^1.9.1", - "html-encoding-sniffer": "^1.0.2", - "left-pad": "^1.3.0", - "nwsapi": "^2.0.7", - "parse5": "4.0.0", - "pn": "^1.1.0", - "request": "^2.87.0", - "request-promise-native": "^1.0.5", - "sax": "^1.2.4", - "symbol-tree": "^3.2.2", - "tough-cookie": "^2.3.4", - "w3c-hr-time": "^1.0.1", - "webidl-conversions": "^4.0.2", - "whatwg-encoding": "^1.0.3", - "whatwg-mimetype": "^2.1.0", - "whatwg-url": "^6.4.1", - "ws": "^5.2.0", - "xml-name-validator": "^3.0.0" - } - }, - "json-schema": { - "version": "0.2.3", - "resolved": "https://registry.npmjs.org/json-schema/-/json-schema-0.2.3.tgz", - "integrity": "sha1-tIDIkuWaLwWVTOcnvT8qTogvnhM=" - }, - "json-schema-traverse": { - "version": "0.4.1", - "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-0.4.1.tgz", - "integrity": "sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg==" - }, - "json-stringify-safe": { - "version": "5.0.1", - "resolved": 
"https://registry.npmjs.org/json-stringify-safe/-/json-stringify-safe-5.0.1.tgz", - "integrity": "sha1-Epai1Y/UXxmg9s4B1lcB4sc1tus=" - }, - "jsprim": { - "version": "1.4.1", - "resolved": "https://registry.npmjs.org/jsprim/-/jsprim-1.4.1.tgz", - "integrity": "sha1-MT5mvB5cwG5Di8G3SZwuXFastqI=", - "requires": { - "assert-plus": "1.0.0", - "extsprintf": "1.3.0", - "json-schema": "0.2.3", - "verror": "1.10.0" - } - }, - "jszip": { - "version": "3.6.0", - "resolved": "https://registry.npmjs.org/jszip/-/jszip-3.6.0.tgz", - "integrity": "sha512-jgnQoG9LKnWO3mnVNBnfhkh0QknICd1FGSrXcgrl67zioyJ4wgx25o9ZqwNtrROSflGBCGYnJfjrIyRIby1OoQ==", - "requires": { - "lie": "~3.3.0", - "pako": "~1.0.2", - "readable-stream": "~2.3.6", - "set-immediate-shim": "~1.0.1" - }, - "dependencies": { - "readable-stream": { - "version": "2.3.7", - "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.7.tgz", - "integrity": "sha512-Ebho8K4jIbHAxnuxi7o42OrZgF/ZTNcsZj6nRKyUmkhLFq8CHItp/fy6hQZuZmP/n3yZ9VBUbp4zz/mX8hmYPw==", - "requires": { - "core-util-is": "~1.0.0", - "inherits": "~2.0.3", - "isarray": "~1.0.0", - "process-nextick-args": "~2.0.0", - "safe-buffer": "~5.1.1", - "string_decoder": "~1.1.1", - "util-deprecate": "~1.0.1" - } - }, - "safe-buffer": { - "version": "5.1.2", - "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz", - "integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==" - }, - "string_decoder": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz", - "integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==", - "requires": { - "safe-buffer": "~5.1.0" - } - } - } - }, - "left-pad": { - "version": "1.3.0", - "resolved": "https://registry.npmjs.org/left-pad/-/left-pad-1.3.0.tgz", - "integrity": 
"sha512-XI5MPzVNApjAyhQzphX8BkmKsKUxD4LdyK24iZeQGinBN9yTQT3bFlCBy/aVx2HrNcqQGsdot8ghrjyrvMCoEA==" - }, - "levn": { - "version": "0.3.0", - "resolved": "https://registry.npmjs.org/levn/-/levn-0.3.0.tgz", - "integrity": "sha1-OwmSTt+fCDwEkP3UwLxEIeBHZO4=", - "requires": { - "prelude-ls": "~1.1.2", - "type-check": "~0.3.2" - } - }, - "lie": { - "version": "3.3.0", - "resolved": "https://registry.npmjs.org/lie/-/lie-3.3.0.tgz", - "integrity": "sha512-UaiMJzeWRlEujzAuw5LokY1L5ecNQYZKfmyZ9L7wDHb/p5etKaxXhohBcrw0EYby+G/NA52vRSN4N39dxHAIwQ==", - "requires": { - "immediate": "~3.0.5" - } - }, - "locate-path": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/locate-path/-/locate-path-5.0.0.tgz", - "integrity": "sha512-t7hw9pI+WvuwNJXwk5zVHpyhIqzg2qTlklJOf0mVxGSbe3Fp2VieZcduNYjaLDoy6p9uGpQEGWG87WpMKlNq8g==", - "requires": { - "p-locate": "^4.1.0" - } - }, - "lodash": { - "version": "4.17.21", - "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz", - "integrity": "sha512-v2kDEe57lecTulaDIuNTPy3Ry4gLGJ6Z1O3vE1krgXZNrsQ+LFTGHVxVjcXPs17LhbZVGedAJv8XZ1tvj5FvSg==" - }, - "lodash.assignin": { - "version": "4.2.0", - "resolved": "https://registry.npmjs.org/lodash.assignin/-/lodash.assignin-4.2.0.tgz", - "integrity": "sha1-uo31+4QesKPoBEIysOJjqNxqKKI=" - }, - "lodash.bind": { - "version": "4.2.1", - "resolved": "https://registry.npmjs.org/lodash.bind/-/lodash.bind-4.2.1.tgz", - "integrity": "sha1-euMBfpOWIqwxt9fX3LGzTbFpDTU=" - }, - "lodash.defaults": { - "version": "4.2.0", - "resolved": "https://registry.npmjs.org/lodash.defaults/-/lodash.defaults-4.2.0.tgz", - "integrity": "sha1-0JF4cW/+pN3p5ft7N/bwgCJ0WAw=" - }, - "lodash.filter": { - "version": "4.6.0", - "resolved": "https://registry.npmjs.org/lodash.filter/-/lodash.filter-4.6.0.tgz", - "integrity": "sha1-ZosdSYFgOuHMWm+nYBQ+SAtMSs4=" - }, - "lodash.flatten": { - "version": "4.4.0", - "resolved": "https://registry.npmjs.org/lodash.flatten/-/lodash.flatten-4.4.0.tgz", - "integrity": 
"sha1-8xwiIlqWMtK7+OSt2+8kCqdlph8=" - }, - "lodash.foreach": { - "version": "4.5.0", - "resolved": "https://registry.npmjs.org/lodash.foreach/-/lodash.foreach-4.5.0.tgz", - "integrity": "sha1-Gmo16s5AEoDH8G3d7DUWWrJ+PlM=" - }, - "lodash.map": { - "version": "4.6.0", - "resolved": "https://registry.npmjs.org/lodash.map/-/lodash.map-4.6.0.tgz", - "integrity": "sha1-dx7Hg540c9nEzeKLGTlMNWL09tM=" - }, - "lodash.merge": { - "version": "4.6.2", - "resolved": "https://registry.npmjs.org/lodash.merge/-/lodash.merge-4.6.2.tgz", - "integrity": "sha512-0KpjqXRVvrYyCsX1swR/XTK0va6VQkQM6MNo7PqW77ByjAhoARA8EfrP1N4+KlKj8YS0ZUCtRT/YUuhyYDujIQ==" - }, - "lodash.pick": { - "version": "4.4.0", - "resolved": "https://registry.npmjs.org/lodash.pick/-/lodash.pick-4.4.0.tgz", - "integrity": "sha1-UvBWEP/53tQiYRRB7R/BI6AwAbM=" - }, - "lodash.reduce": { - "version": "4.6.0", - "resolved": "https://registry.npmjs.org/lodash.reduce/-/lodash.reduce-4.6.0.tgz", - "integrity": "sha1-8atrg5KZrUj3hKu/R2WW8DuRTTs=" - }, - "lodash.reject": { - "version": "4.6.0", - "resolved": "https://registry.npmjs.org/lodash.reject/-/lodash.reject-4.6.0.tgz", - "integrity": "sha1-gNZJLcFHCGS79YNTO2UfQqn1JBU=" - }, - "lodash.some": { - "version": "4.6.0", - "resolved": "https://registry.npmjs.org/lodash.some/-/lodash.some-4.6.0.tgz", - "integrity": "sha1-G7nzFO9ri63tE7VJFpsqlF62jk0=" - }, - "lodash.sortby": { - "version": "4.7.0", - "resolved": "https://registry.npmjs.org/lodash.sortby/-/lodash.sortby-4.7.0.tgz", - "integrity": "sha1-7dFMgk4sycHgsKG0K7UhBRakJDg=" - }, - "mime-db": { - "version": "1.47.0", - "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.47.0.tgz", - "integrity": "sha512-QBmA/G2y+IfeS4oktet3qRZ+P5kPhCKRXxXnQEudYqUaEioAU1/Lq2us3D/t1Jfo4hE9REQPrbB7K5sOczJVIw==" - }, - "mime-types": { - "version": "2.1.30", - "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.30.tgz", - "integrity": 
"sha512-crmjA4bLtR8m9qLpHvgxSChT+XoSlZi8J4n/aIdn3z92e/U47Z0V/yl+Wh9W046GgFVAmoNR/fmdbZYcSSIUeg==", - "requires": { - "mime-db": "1.47.0" - } - }, - "minimatch": { - "version": "3.0.4", - "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.0.4.tgz", - "integrity": "sha512-yJHVQEhyqPLUTgt9B83PXu6W3rx4MvvHvSUvToogpwoGDOUQ+yDrR0HRot+yOCdCO7u4hX3pWft6kWBBcqh0UA==", - "requires": { - "brace-expansion": "^1.1.7" - } - }, - "mkdirp-classic": { - "version": "0.5.3", - "resolved": "https://registry.npmjs.org/mkdirp-classic/-/mkdirp-classic-0.5.3.tgz", - "integrity": "sha512-gKLcREMhtuZRwRAfqP3RFW+TK4JqApVBtOIftVgjuABpAtpxhPGaDcfvbhNvD0B8iD1oUr/txX35NjcaY6Ns/A==" - }, - "moment-parseformat": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/moment-parseformat/-/moment-parseformat-3.0.0.tgz", - "integrity": "sha512-dVgXe6b6DLnv4CHG7a1zUe5mSXaIZ3c6lSHm/EKeVeQI2/4pwe0VRde8OyoCE1Ro2lKT5P6uT9JElF7KDLV+jw==" - }, - "ms": { - "version": "2.1.2", - "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.2.tgz", - "integrity": "sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w==" - }, - "next-line": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/next-line/-/next-line-1.1.0.tgz", - "integrity": "sha1-/K5XhTBStqm66CCOQN19PC0wRgM=" - }, - "node-fetch": { - "version": "2.6.1", - "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.6.1.tgz", - "integrity": "sha512-V4aYg89jEoVRxRb2fJdAg8FHvI7cEyYdVAh94HH0UIK8oJxUfkjlDQN9RbMx+bEjP7+ggMiFRprSti032Oipxw==" - }, - "nth-check": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/nth-check/-/nth-check-1.0.2.tgz", - "integrity": "sha512-WeBOdju8SnzPN5vTUJYxYUxLeXpCaVP5i5e0LF8fg7WORF2Wd7wFX/pk0tYZk7s8T+J7VLy0Da6J1+wCT0AtHg==", - "requires": { - "boolbase": "~1.0.0" - } - }, - "nwsapi": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/nwsapi/-/nwsapi-2.2.0.tgz", - "integrity": 
"sha512-h2AatdwYH+JHiZpv7pt/gSX1XoRGb7L/qSIeuqA6GwYoF9w1vP1cw42TO0aI2pNyshRK5893hNSl+1//vHK7hQ==" - }, - "oauth-sign": { - "version": "0.9.0", - "resolved": "https://registry.npmjs.org/oauth-sign/-/oauth-sign-0.9.0.tgz", - "integrity": "sha512-fexhUFFPTGV8ybAtSIGbV6gOkSv8UtRbDBnAyLQw4QPKkgNlsH2ByPGtMUqdWkos6YCRmAqViwgZrJc/mRDzZQ==" - }, - "once": { - "version": "1.4.0", - "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz", - "integrity": "sha1-WDsap3WWHUsROsF9nFC6753Xa9E=", - "requires": { - "wrappy": "1" - } - }, - "optionator": { - "version": "0.8.3", - "resolved": "https://registry.npmjs.org/optionator/-/optionator-0.8.3.tgz", - "integrity": "sha512-+IW9pACdk3XWmmTXG8m3upGUJst5XRGzxMRjXzAuJ1XnIFNvfhjjIuYkDvysnPQ7qzqVzLt78BCruntqRhWQbA==", - "requires": { - "deep-is": "~0.1.3", - "fast-levenshtein": "~2.0.6", - "levn": "~0.3.0", - "prelude-ls": "~1.1.2", - "type-check": "~0.3.2", - "word-wrap": "~1.2.3" - } - }, - "os-tmpdir": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/os-tmpdir/-/os-tmpdir-1.0.2.tgz", - "integrity": "sha1-u+Z0BseaqFxc/sdm/lc0VV36EnQ=" - }, - "p-limit": { - "version": "2.3.0", - "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-2.3.0.tgz", - "integrity": "sha512-//88mFWSJx8lxCzwdAABTJL2MyWB12+eIY7MDL2SqLmAkeKU9qxRvWuSyTjm3FUmpBEMuFfckAIqEaVGUDxb6w==", - "requires": { - "p-try": "^2.0.0" - } - }, - "p-locate": { - "version": "4.1.0", - "resolved": "https://registry.npmjs.org/p-locate/-/p-locate-4.1.0.tgz", - "integrity": "sha512-R79ZZ/0wAxKGu3oYMlz8jy/kbhsNrS7SKZ7PxEHBgJ5+F2mtFW2fK2cOtBh1cHYkQsbzFV7I+EoRKe6Yt0oK7A==", - "requires": { - "p-limit": "^2.2.0" - } - }, - "p-try": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/p-try/-/p-try-2.2.0.tgz", - "integrity": "sha512-R4nPAVTAU0B9D35/Gk3uJf/7XYbQcyohSKdvAxIRSNghFl4e71hVoGnBNQz9cWaXxO2I10KTC+3jMdvvoKw6dQ==" - }, - "pako": { - "version": "1.0.11", - "resolved": "https://registry.npmjs.org/pako/-/pako-1.0.11.tgz", - "integrity": 
"sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw==" - }, - "parse5": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/parse5/-/parse5-4.0.0.tgz", - "integrity": "sha512-VrZ7eOd3T1Fk4XWNXMgiGBK/z0MG48BWG2uQNU4I72fkQuKUTZpl+u9k+CxEG0twMVzSmXEEz12z5Fnw1jIQFA==" - }, - "path-exists": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/path-exists/-/path-exists-4.0.0.tgz", - "integrity": "sha512-ak9Qy5Q7jYb2Wwcey5Fpvg2KoAc/ZIhLSLOSBmRmygPsGwkVVt0fZa0qrtMz+m6tJTAHfZQ8FnmB4MG4LWy7/w==" - }, - "path-is-absolute": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/path-is-absolute/-/path-is-absolute-1.0.1.tgz", - "integrity": "sha1-F0uSaHNVNP+8es5r9TpanhtcX18=" - }, - "pend": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/pend/-/pend-1.2.0.tgz", - "integrity": "sha1-elfrVQpng/kRUzH89GY9XI4AelA=" - }, - "performance-now": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/performance-now/-/performance-now-2.1.0.tgz", - "integrity": "sha1-Ywn04OX6kT7BxpMHrjZLSzd8nns=" - }, - "pkg-dir": { - "version": "4.2.0", - "resolved": "https://registry.npmjs.org/pkg-dir/-/pkg-dir-4.2.0.tgz", - "integrity": "sha512-HRDzbaKjC+AOWVXxAU/x54COGeIv9eb+6CkDSQoNTt4XyWoIJvuPsXizxu/Fr23EiekbtZwmh1IcIG/l/a10GQ==", - "requires": { - "find-up": "^4.0.0" - } - }, - "pn": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/pn/-/pn-1.1.0.tgz", - "integrity": "sha512-2qHaIQr2VLRFoxe2nASzsV6ef4yOOH+Fi9FBOVH6cqeSgUnoyySPZkxzLuzd+RYOQTRpROA0ztTMqxROKSb/nA==" - }, - "postman-request": { - "version": "2.88.1-postman.29", - "resolved": "https://registry.npmjs.org/postman-request/-/postman-request-2.88.1-postman.29.tgz", - "integrity": "sha512-QuL3+AvGlmPLb1Qf0t/rM8M4U8LCYbADZBijUNToLl6l37i65KH8wY1gTLWLxlw2I6ugxUfX2Zyyk5/J5HFZIg==", - "requires": { - "@postman/form-data": "~3.1.1", - "@postman/tunnel-agent": "^0.6.3", - "aws-sign2": "~0.7.0", - "aws4": "^1.8.0", - "brotli": 
"~1.3.2", - "caseless": "~0.12.0", - "combined-stream": "~1.0.6", - "extend": "~3.0.2", - "forever-agent": "~0.6.1", - "har-validator": "~5.1.3", - "http-signature": "~1.3.1", - "is-typedarray": "~1.0.0", - "isstream": "~0.1.2", - "json-stringify-safe": "~5.0.1", - "mime-types": "~2.1.19", - "oauth-sign": "~0.9.0", - "performance-now": "^2.1.0", - "qs": "~6.5.2", - "safe-buffer": "^5.1.2", - "stream-length": "^1.0.2", - "tough-cookie": "~2.5.0", - "uuid": "^3.3.2" - } - }, - "prelude-ls": { - "version": "1.1.2", - "resolved": "https://registry.npmjs.org/prelude-ls/-/prelude-ls-1.1.2.tgz", - "integrity": "sha1-IZMqVJ9eUv/ZqCf1cOBL5iqX2lQ=" - }, - "process-nextick-args": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.1.tgz", - "integrity": "sha512-3ouUOpQhtgrbOa17J7+uxOTpITYWaGP7/AhoR3+A+/1e9skrzelGi/dXzEYyvbxubEF6Wn2ypscTKiKJFFn1ag==" - }, - "progress": { - "version": "2.0.3", - "resolved": "https://registry.npmjs.org/progress/-/progress-2.0.3.tgz", - "integrity": "sha512-7PiHtLll5LdnKIMw100I+8xJXR5gW2QwWYkT6iJva0bXitZKa/XMrSbdmg3r2Xnaidz9Qumd0VPaMrZlF9V9sA==" - }, - "proxy-from-env": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.1.0.tgz", - "integrity": "sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==" - }, - "psl": { - "version": "1.8.0", - "resolved": "https://registry.npmjs.org/psl/-/psl-1.8.0.tgz", - "integrity": "sha512-RIdOzyoavK+hA18OGGWDqUTsCLhtA7IcZ/6NCs4fFJaHBDab+pDDmDIByWFRQJq2Cd7r1OoQxBGKOaztq+hjIQ==" - }, - "pump": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz", - "integrity": "sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww==", - "requires": { - "end-of-stream": "^1.1.0", - "once": "^1.3.1" - } - }, - "punycode": { - "version": "2.1.1", - "resolved": 
"https://registry.npmjs.org/punycode/-/punycode-2.1.1.tgz", - "integrity": "sha512-XRsRjdf+j5ml+y/6GKHPZbrF/8p2Yga0JPtdqTIY2Xe5ohJPD9saDJJLPvp9+NSBprVvevdXZybnj2cv8OEd0A==" - }, - "puppeteer-core": { - "version": "5.5.0", - "resolved": "https://registry.npmjs.org/puppeteer-core/-/puppeteer-core-5.5.0.tgz", - "integrity": "sha512-tlA+1n+ziW/Db03hVV+bAecDKse8ihFRXYiEypBe9IlLRvOCzYFG6qrCMBYK34HO/Q/Ecjc+tvkHRAfLVH+NgQ==", - "requires": { - "debug": "^4.1.0", - "devtools-protocol": "0.0.818844", - "extract-zip": "^2.0.0", - "https-proxy-agent": "^4.0.0", - "node-fetch": "^2.6.1", - "pkg-dir": "^4.2.0", - "progress": "^2.0.1", - "proxy-from-env": "^1.0.0", - "rimraf": "^3.0.2", - "tar-fs": "^2.0.0", - "unbzip2-stream": "^1.3.3", - "ws": "^7.2.3" - }, - "dependencies": { - "ws": { - "version": "7.4.4", - "resolved": "https://registry.npmjs.org/ws/-/ws-7.4.4.tgz", - "integrity": "sha512-Qm8k8ojNQIMx7S+Zp8u/uHOx7Qazv3Yv4q68MiWWWOJhiwG5W3x7iqmRtJo8xxrciZUY4vRxUTJCKuRnF28ZZw==" - } - } - }, - "qs": { - "version": "6.5.2", - "resolved": "https://registry.npmjs.org/qs/-/qs-6.5.2.tgz", - "integrity": "sha512-N5ZAX4/LxJmF+7wN74pUD6qAh9/wnvdQcjq9TZjevvXzSUo7bfmw91saqMjzGS2xq91/odN2dW/WOl7qQHNDGA==" - }, - "querystring": { - "version": "0.2.0", - "resolved": "https://registry.npmjs.org/querystring/-/querystring-0.2.0.tgz", - "integrity": "sha1-sgmEkgO7Jd+CDadW50cAWHhSFiA=" - }, - "readability-extractor": { - "version": "git+https://github.com/ArchiveBox/readability-extractor.git#42b243843c724a5d7a6b364d23985ff6acaeb55a", - "from": "git+https://github.com/ArchiveBox/readability-extractor.git", - "requires": { - "@mozilla/readability": "^0.4.1", - "dompurify": "^2.2.7", - "jsdom": "^16.5.2" - }, - "dependencies": { - "acorn": { - "version": "8.1.0", - "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.1.0.tgz", - "integrity": "sha512-LWCF/Wn0nfHOmJ9rzQApGnxnvgfROzGilS8936rqN/lfcYkY9MYZzdMqN+2NJ4SlTc+m5HiSa+kNfDtI64dwUA==" - }, - "acorn-globals": { - "version": "6.0.0", - 
"resolved": "https://registry.npmjs.org/acorn-globals/-/acorn-globals-6.0.0.tgz", - "integrity": "sha512-ZQl7LOWaF5ePqqcX4hLuv/bLXYQNfNWw2c0/yX/TsPRKamzHcTGQnlCjHT3TsmkOUVEPS3crCxiPfdzE/Trlhg==", - "requires": { - "acorn": "^7.1.1", - "acorn-walk": "^7.1.1" - }, - "dependencies": { - "acorn": { - "version": "7.4.1", - "resolved": "https://registry.npmjs.org/acorn/-/acorn-7.4.1.tgz", - "integrity": "sha512-nQyp0o1/mNdbTO1PO6kHkwSrmgZ0MT/jCCpNiwbUjGoRN4dlBhqJtoQuCnEOKzgTVwg0ZWiCoQy6SxMebQVh8A==" - } - } - }, - "acorn-walk": { - "version": "7.2.0", - "resolved": "https://registry.npmjs.org/acorn-walk/-/acorn-walk-7.2.0.tgz", - "integrity": "sha512-OPdCF6GsMIP+Az+aWfAAOEt2/+iVDKE7oy6lJ098aoe59oAmK76qV6Gw60SbZ8jHuG2wH058GF4pLFbYamYrVA==" - }, - "cssom": { - "version": "0.4.4", - "resolved": "https://registry.npmjs.org/cssom/-/cssom-0.4.4.tgz", - "integrity": "sha512-p3pvU7r1MyyqbTk+WbNJIgJjG2VmTIaB10rI93LzVPrmDJKkzKYMtxxyAvQXR/NS6otuzveI7+7BBq3SjBS2mw==" - }, - "cssstyle": { - "version": "2.3.0", - "resolved": "https://registry.npmjs.org/cssstyle/-/cssstyle-2.3.0.tgz", - "integrity": "sha512-AZL67abkUzIuvcHqk7c09cezpGNcxUxU4Ioi/05xHk4DQeTkWmGYftIE6ctU6AEt+Gn4n1lDStOtj7FKycP71A==", - "requires": { - "cssom": "~0.3.6" - }, - "dependencies": { - "cssom": { - "version": "0.3.8", - "resolved": "https://registry.npmjs.org/cssom/-/cssom-0.3.8.tgz", - "integrity": "sha512-b0tGHbfegbhPJpxpiBPU2sCkigAqtM9O121le6bbOlgyV+NyGyCmVfJ6QW9eRjz8CpNfWEOYBIMIGRYkLwsIYg==" - } - } - }, - "data-urls": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/data-urls/-/data-urls-2.0.0.tgz", - "integrity": "sha512-X5eWTSXO/BJmpdIKCRuKUgSCgAN0OwliVK3yPKbwIWU1Tdw5BRajxlzMidvh+gwko9AfQ9zIj52pzF91Q3YAvQ==", - "requires": { - "abab": "^2.0.3", - "whatwg-mimetype": "^2.3.0", - "whatwg-url": "^8.0.0" - } - }, - "domexception": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/domexception/-/domexception-2.0.1.tgz", - "integrity": 
"sha512-yxJ2mFy/sibVQlu5qHjOkf9J3K6zgmCxgJ94u2EdvDOV09H+32LtRswEcUsmUWN72pVLOEnTSRaIVVzVQgS0dg==", - "requires": { - "webidl-conversions": "^5.0.0" - }, - "dependencies": { - "webidl-conversions": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-5.0.0.tgz", - "integrity": "sha512-VlZwKPCkYKxQgeSbH5EyngOmRp7Ww7I9rQLERETtf5ofd9pGeswWiOtogpEO850jziPRarreGxn5QIiTqpb2wA==" - } - } - }, - "escodegen": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/escodegen/-/escodegen-2.0.0.tgz", - "integrity": "sha512-mmHKys/C8BFUGI+MAWNcSYoORYLMdPzjrknd2Vc+bUsjN5bXcr8EhrNB+UTqfL1y3I9c4fw2ihgtMPQLBRiQxw==", - "requires": { - "esprima": "^4.0.1", - "estraverse": "^5.2.0", - "esutils": "^2.0.2", - "optionator": "^0.8.1", - "source-map": "~0.6.1" - } - }, - "estraverse": { - "version": "5.2.0", - "resolved": "https://registry.npmjs.org/estraverse/-/estraverse-5.2.0.tgz", - "integrity": "sha512-BxbNGGNm0RyRYvUdHpIwv9IWzeM9XClbOxwoATuFdOE7ZE6wHL+HQ5T8hoPM+zHvmKzzsEqhgy0GrQ5X13afiQ==" - }, - "html-encoding-sniffer": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/html-encoding-sniffer/-/html-encoding-sniffer-2.0.1.tgz", - "integrity": "sha512-D5JbOMBIR/TVZkubHT+OyT2705QvogUW4IBn6nHd756OwieSF9aDYFj4dv6HHEVGYbHaLETa3WggZYWWMyy3ZQ==", - "requires": { - "whatwg-encoding": "^1.0.5" - } - }, - "jsdom": { - "version": "16.5.2", - "resolved": "https://registry.npmjs.org/jsdom/-/jsdom-16.5.2.tgz", - "integrity": "sha512-JxNtPt9C1ut85boCbJmffaQ06NBnzkQY/MWO3YxPW8IWS38A26z+B1oBvA9LwKrytewdfymnhi4UNH3/RAgZrg==", - "requires": { - "abab": "^2.0.5", - "acorn": "^8.1.0", - "acorn-globals": "^6.0.0", - "cssom": "^0.4.4", - "cssstyle": "^2.3.0", - "data-urls": "^2.0.0", - "decimal.js": "^10.2.1", - "domexception": "^2.0.1", - "escodegen": "^2.0.0", - "html-encoding-sniffer": "^2.0.1", - "is-potential-custom-element-name": "^1.0.0", - "nwsapi": "^2.2.0", - "parse5": "6.0.1", - "request": "^2.88.2", - 
"request-promise-native": "^1.0.9", - "saxes": "^5.0.1", - "symbol-tree": "^3.2.4", - "tough-cookie": "^4.0.0", - "w3c-hr-time": "^1.0.2", - "w3c-xmlserializer": "^2.0.0", - "webidl-conversions": "^6.1.0", - "whatwg-encoding": "^1.0.5", - "whatwg-mimetype": "^2.3.0", - "whatwg-url": "^8.5.0", - "ws": "^7.4.4", - "xml-name-validator": "^3.0.0" - } - }, - "parse5": { - "version": "6.0.1", - "resolved": "https://registry.npmjs.org/parse5/-/parse5-6.0.1.tgz", - "integrity": "sha512-Ofn/CTFzRGTTxwpNEs9PP93gXShHcTq255nzRYSKe8AkVpZY7e1fpmTfOyoIvjP5HG7Z2ZM7VS9PPhQGW2pOpw==" - }, - "tough-cookie": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/tough-cookie/-/tough-cookie-4.0.0.tgz", - "integrity": "sha512-tHdtEpQCMrc1YLrMaqXXcj6AxhYi/xgit6mZu1+EDWUn+qhUf8wMQoFIy9NXuq23zAwtcB0t/MjACGR18pcRbg==", - "requires": { - "psl": "^1.1.33", - "punycode": "^2.1.1", - "universalify": "^0.1.2" - } - }, - "tr46": { - "version": "2.0.2", - "resolved": "https://registry.npmjs.org/tr46/-/tr46-2.0.2.tgz", - "integrity": "sha512-3n1qG+/5kg+jrbTzwAykB5yRYtQCTqOGKq5U5PE3b0a1/mzo6snDhjGS0zJVJunO0NrT3Dg1MLy5TjWP/UJppg==", - "requires": { - "punycode": "^2.1.1" - } - }, - "webidl-conversions": { - "version": "6.1.0", - "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-6.1.0.tgz", - "integrity": "sha512-qBIvFLGiBpLjfwmYAaHPXsn+ho5xZnGvyGvsarywGNc8VyQJUMHJ8OBKGGrPER0okBeMDaan4mNBlgBROxuI8w==" - }, - "whatwg-url": { - "version": "8.5.0", - "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-8.5.0.tgz", - "integrity": "sha512-fy+R77xWv0AiqfLl4nuGUlQ3/6b5uNfQ4WAbGQVMYshCTCCPK9psC1nWh3XHuxGVCtlcDDQPQW1csmmIQo+fwg==", - "requires": { - "lodash": "^4.7.0", - "tr46": "^2.0.2", - "webidl-conversions": "^6.1.0" - } - }, - "ws": { - "version": "7.4.4", - "resolved": "https://registry.npmjs.org/ws/-/ws-7.4.4.tgz", - "integrity": "sha512-Qm8k8ojNQIMx7S+Zp8u/uHOx7Qazv3Yv4q68MiWWWOJhiwG5W3x7iqmRtJo8xxrciZUY4vRxUTJCKuRnF28ZZw==" - } - } - }, - 
"readable-stream": { - "version": "3.6.0", - "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.6.0.tgz", - "integrity": "sha512-BViHy7LKeTz4oNnkcLJ+lVSL6vpiFeX6/d3oSH8zCW7UxP2onchk+vTGB143xuFjHS3deTgkKoXXymXqymiIdA==", - "requires": { - "inherits": "^2.0.3", - "string_decoder": "^1.1.1", - "util-deprecate": "^1.0.1" - } - }, - "regenerator-runtime": { - "version": "0.13.7", - "resolved": "https://registry.npmjs.org/regenerator-runtime/-/regenerator-runtime-0.13.7.tgz", - "integrity": "sha512-a54FxoJDIr27pgf7IgeQGxmqUNYrcV338lf/6gH456HZ/PhX+5BcwHXG9ajESmwe6WRO0tAzRUrRmNONWgkrew==" - }, - "request": { - "version": "2.88.2", - "resolved": "https://registry.npmjs.org/request/-/request-2.88.2.tgz", - "integrity": "sha512-MsvtOrfG9ZcrOwAW+Qi+F6HbD0CWXEh9ou77uOb7FM2WPhwT7smM833PzanhJLsgXjN89Ir6V2PczXNnMpwKhw==", - "requires": { - "aws-sign2": "~0.7.0", - "aws4": "^1.8.0", - "caseless": "~0.12.0", - "combined-stream": "~1.0.6", - "extend": "~3.0.2", - "forever-agent": "~0.6.1", - "form-data": "~2.3.2", - "har-validator": "~5.1.3", - "http-signature": "~1.2.0", - "is-typedarray": "~1.0.0", - "isstream": "~0.1.2", - "json-stringify-safe": "~5.0.1", - "mime-types": "~2.1.19", - "oauth-sign": "~0.9.0", - "performance-now": "^2.1.0", - "qs": "~6.5.2", - "safe-buffer": "^5.1.2", - "tough-cookie": "~2.5.0", - "tunnel-agent": "^0.6.0", - "uuid": "^3.3.2" - }, - "dependencies": { - "http-signature": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/http-signature/-/http-signature-1.2.0.tgz", - "integrity": "sha1-muzZJRFHcvPZW2WmCruPfBj7rOE=", - "requires": { - "assert-plus": "^1.0.0", - "jsprim": "^1.2.2", - "sshpk": "^1.7.0" - } - } - } - }, - "request-promise": { - "version": "4.2.6", - "resolved": "https://registry.npmjs.org/request-promise/-/request-promise-4.2.6.tgz", - "integrity": "sha512-HCHI3DJJUakkOr8fNoCc73E5nU5bqITjOYFMDrKHYOXWXrgD/SBaC7LjwuPymUprRyuF06UK7hd/lMHkmUXglQ==", - "requires": { - "bluebird": "^3.5.0", - 
"request-promise-core": "1.1.4", - "stealthy-require": "^1.1.1", - "tough-cookie": "^2.3.3" - }, - "dependencies": { - "bluebird": { - "version": "3.7.2", - "resolved": "https://registry.npmjs.org/bluebird/-/bluebird-3.7.2.tgz", - "integrity": "sha512-XpNj6GDQzdfW+r2Wnn7xiSAd7TM3jzkxGXBGTtWKuSXv1xUV+azxAm8jdWZN06QTQk+2N2XB9jRDkvbmQmcRtg==" - } - } - }, - "request-promise-core": { - "version": "1.1.4", - "resolved": "https://registry.npmjs.org/request-promise-core/-/request-promise-core-1.1.4.tgz", - "integrity": "sha512-TTbAfBBRdWD7aNNOoVOBH4pN/KigV6LyapYNNlAPA8JwbovRti1E88m3sYAwsLi5ryhPKsE9APwnjFTgdUjTpw==", - "requires": { - "lodash": "^4.17.19" - } - }, - "request-promise-native": { - "version": "1.0.9", - "resolved": "https://registry.npmjs.org/request-promise-native/-/request-promise-native-1.0.9.tgz", - "integrity": "sha512-wcW+sIUiWnKgNY0dqCpOZkUbF/I+YPi+f09JZIDa39Ec+q82CpSYniDp+ISgTTbKmnpJWASeJBPZmoxH84wt3g==", - "requires": { - "request-promise-core": "1.1.4", - "stealthy-require": "^1.1.1", - "tough-cookie": "^2.3.3" - } - }, - "require-directory": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/require-directory/-/require-directory-2.1.1.tgz", - "integrity": "sha1-jGStX9MNqxyXbiNE/+f3kqam30I=" - }, - "rimraf": { - "version": "3.0.2", - "resolved": "https://registry.npmjs.org/rimraf/-/rimraf-3.0.2.tgz", - "integrity": "sha512-JZkJMZkAGFFPP2YqXZXPbMlMBgsxzE8ILs4lMIX/2o0L9UBw9O/Y3o6wFw/i9YLapcUJWwqbi3kdxIPdC62TIA==", - "requires": { - "glob": "^7.1.3" - } - }, - "safe-buffer": { - "version": "5.2.1", - "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz", - "integrity": "sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==" - }, - "safer-buffer": { - "version": "2.1.2", - "resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz", - "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==" - }, - 
"sax": { - "version": "1.2.4", - "resolved": "https://registry.npmjs.org/sax/-/sax-1.2.4.tgz", - "integrity": "sha512-NqVDv9TpANUjFm0N8uM5GxL36UgKi9/atZw+x7YFnQ8ckwFGKrl4xX4yWtrey3UJm5nP1kUbnYgLopqWNSRhWw==" - }, - "saxes": { - "version": "5.0.1", - "resolved": "https://registry.npmjs.org/saxes/-/saxes-5.0.1.tgz", - "integrity": "sha512-5LBh1Tls8c9xgGjw3QrMwETmTMVk0oFgvrFSvWx62llR2hcEInrKNZ2GZCCuuy2lvWrdl5jhbpeqc5hRYKFOcw==", - "requires": { - "xmlchars": "^2.2.0" - } - }, - "selenium-webdriver": { - "version": "4.0.0-alpha.7", - "resolved": "https://registry.npmjs.org/selenium-webdriver/-/selenium-webdriver-4.0.0-alpha.7.tgz", - "integrity": "sha512-D4qnTsyTr91jT8f7MfN+OwY0IlU5+5FmlO5xlgRUV6hDEV8JyYx2NerdTEqDDkNq7RZDYc4VoPALk8l578RBHw==", - "requires": { - "jszip": "^3.2.2", - "rimraf": "^2.7.1", - "tmp": "0.0.30" - }, - "dependencies": { - "rimraf": { - "version": "2.7.1", - "resolved": "https://registry.npmjs.org/rimraf/-/rimraf-2.7.1.tgz", - "integrity": "sha512-uWjbaKIK3T1OSVptzX7Nl6PvQ3qAGtKEtVRjRuazjfL3Bx5eI409VZSqgND+4UNnmzLVdPj9FqFJNPqBZFve4w==", - "requires": { - "glob": "^7.1.3" - } - } - } - }, - "set-immediate-shim": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/set-immediate-shim/-/set-immediate-shim-1.0.1.tgz", - "integrity": "sha1-SysbJ+uAip+NzEgaWOXlb1mfP2E=" - }, - "single-file": { - "version": "git+https://github.com/gildas-lormeau/SingleFile.git#ec9dbc7c2272bff0dc2415a44d6cdfb2b48aa7d2", - "from": "git+https://github.com/gildas-lormeau/SingleFile.git", - "requires": { - "file-url": "^3.0.0", - "iconv-lite": "^0.6.2", - "jsdom": "^16.4.0", - "puppeteer-core": "^5.3.0", - "selenium-webdriver": "4.0.0-alpha.7", - "strong-data-uri": "^1.0.6", - "yargs": "^16.2.0" - }, - "dependencies": { - "acorn": { - "version": "8.1.0", - "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.1.0.tgz", - "integrity": "sha512-LWCF/Wn0nfHOmJ9rzQApGnxnvgfROzGilS8936rqN/lfcYkY9MYZzdMqN+2NJ4SlTc+m5HiSa+kNfDtI64dwUA==" - }, - "acorn-globals": { - 
"version": "6.0.0", - "resolved": "https://registry.npmjs.org/acorn-globals/-/acorn-globals-6.0.0.tgz", - "integrity": "sha512-ZQl7LOWaF5ePqqcX4hLuv/bLXYQNfNWw2c0/yX/TsPRKamzHcTGQnlCjHT3TsmkOUVEPS3crCxiPfdzE/Trlhg==", - "requires": { - "acorn": "^7.1.1", - "acorn-walk": "^7.1.1" - }, - "dependencies": { - "acorn": { - "version": "7.4.1", - "resolved": "https://registry.npmjs.org/acorn/-/acorn-7.4.1.tgz", - "integrity": "sha512-nQyp0o1/mNdbTO1PO6kHkwSrmgZ0MT/jCCpNiwbUjGoRN4dlBhqJtoQuCnEOKzgTVwg0ZWiCoQy6SxMebQVh8A==" - } - } - }, - "acorn-walk": { - "version": "7.2.0", - "resolved": "https://registry.npmjs.org/acorn-walk/-/acorn-walk-7.2.0.tgz", - "integrity": "sha512-OPdCF6GsMIP+Az+aWfAAOEt2/+iVDKE7oy6lJ098aoe59oAmK76qV6Gw60SbZ8jHuG2wH058GF4pLFbYamYrVA==" - }, - "cssom": { - "version": "0.4.4", - "resolved": "https://registry.npmjs.org/cssom/-/cssom-0.4.4.tgz", - "integrity": "sha512-p3pvU7r1MyyqbTk+WbNJIgJjG2VmTIaB10rI93LzVPrmDJKkzKYMtxxyAvQXR/NS6otuzveI7+7BBq3SjBS2mw==" - }, - "cssstyle": { - "version": "2.3.0", - "resolved": "https://registry.npmjs.org/cssstyle/-/cssstyle-2.3.0.tgz", - "integrity": "sha512-AZL67abkUzIuvcHqk7c09cezpGNcxUxU4Ioi/05xHk4DQeTkWmGYftIE6ctU6AEt+Gn4n1lDStOtj7FKycP71A==", - "requires": { - "cssom": "~0.3.6" - }, - "dependencies": { - "cssom": { - "version": "0.3.8", - "resolved": "https://registry.npmjs.org/cssom/-/cssom-0.3.8.tgz", - "integrity": "sha512-b0tGHbfegbhPJpxpiBPU2sCkigAqtM9O121le6bbOlgyV+NyGyCmVfJ6QW9eRjz8CpNfWEOYBIMIGRYkLwsIYg==" - } - } - }, - "data-urls": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/data-urls/-/data-urls-2.0.0.tgz", - "integrity": "sha512-X5eWTSXO/BJmpdIKCRuKUgSCgAN0OwliVK3yPKbwIWU1Tdw5BRajxlzMidvh+gwko9AfQ9zIj52pzF91Q3YAvQ==", - "requires": { - "abab": "^2.0.3", - "whatwg-mimetype": "^2.3.0", - "whatwg-url": "^8.0.0" - } - }, - "domexception": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/domexception/-/domexception-2.0.1.tgz", - "integrity": 
"sha512-yxJ2mFy/sibVQlu5qHjOkf9J3K6zgmCxgJ94u2EdvDOV09H+32LtRswEcUsmUWN72pVLOEnTSRaIVVzVQgS0dg==", - "requires": { - "webidl-conversions": "^5.0.0" - }, - "dependencies": { - "webidl-conversions": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-5.0.0.tgz", - "integrity": "sha512-VlZwKPCkYKxQgeSbH5EyngOmRp7Ww7I9rQLERETtf5ofd9pGeswWiOtogpEO850jziPRarreGxn5QIiTqpb2wA==" - } - } - }, - "escodegen": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/escodegen/-/escodegen-2.0.0.tgz", - "integrity": "sha512-mmHKys/C8BFUGI+MAWNcSYoORYLMdPzjrknd2Vc+bUsjN5bXcr8EhrNB+UTqfL1y3I9c4fw2ihgtMPQLBRiQxw==", - "requires": { - "esprima": "^4.0.1", - "estraverse": "^5.2.0", - "esutils": "^2.0.2", - "optionator": "^0.8.1", - "source-map": "~0.6.1" - } - }, - "estraverse": { - "version": "5.2.0", - "resolved": "https://registry.npmjs.org/estraverse/-/estraverse-5.2.0.tgz", - "integrity": "sha512-BxbNGGNm0RyRYvUdHpIwv9IWzeM9XClbOxwoATuFdOE7ZE6wHL+HQ5T8hoPM+zHvmKzzsEqhgy0GrQ5X13afiQ==" - }, - "html-encoding-sniffer": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/html-encoding-sniffer/-/html-encoding-sniffer-2.0.1.tgz", - "integrity": "sha512-D5JbOMBIR/TVZkubHT+OyT2705QvogUW4IBn6nHd756OwieSF9aDYFj4dv6HHEVGYbHaLETa3WggZYWWMyy3ZQ==", - "requires": { - "whatwg-encoding": "^1.0.5" - } - }, - "iconv-lite": { - "version": "0.6.2", - "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.2.tgz", - "integrity": "sha512-2y91h5OpQlolefMPmUlivelittSWy0rP+oYVpn6A7GwVHNE8AWzoYOBNmlwks3LobaJxgHCYZAnyNo2GgpNRNQ==", - "requires": { - "safer-buffer": ">= 2.1.2 < 3.0.0" - } - }, - "jsdom": { - "version": "16.5.2", - "resolved": "https://registry.npmjs.org/jsdom/-/jsdom-16.5.2.tgz", - "integrity": "sha512-JxNtPt9C1ut85boCbJmffaQ06NBnzkQY/MWO3YxPW8IWS38A26z+B1oBvA9LwKrytewdfymnhi4UNH3/RAgZrg==", - "requires": { - "abab": "^2.0.5", - "acorn": "^8.1.0", - "acorn-globals": "^6.0.0", - "cssom": "^0.4.4", - 
"cssstyle": "^2.3.0", - "data-urls": "^2.0.0", - "decimal.js": "^10.2.1", - "domexception": "^2.0.1", - "escodegen": "^2.0.0", - "html-encoding-sniffer": "^2.0.1", - "is-potential-custom-element-name": "^1.0.0", - "nwsapi": "^2.2.0", - "parse5": "6.0.1", - "request": "^2.88.2", - "request-promise-native": "^1.0.9", - "saxes": "^5.0.1", - "symbol-tree": "^3.2.4", - "tough-cookie": "^4.0.0", - "w3c-hr-time": "^1.0.2", - "w3c-xmlserializer": "^2.0.0", - "webidl-conversions": "^6.1.0", - "whatwg-encoding": "^1.0.5", - "whatwg-mimetype": "^2.3.0", - "whatwg-url": "^8.5.0", - "ws": "^7.4.4", - "xml-name-validator": "^3.0.0" - } - }, - "parse5": { - "version": "6.0.1", - "resolved": "https://registry.npmjs.org/parse5/-/parse5-6.0.1.tgz", - "integrity": "sha512-Ofn/CTFzRGTTxwpNEs9PP93gXShHcTq255nzRYSKe8AkVpZY7e1fpmTfOyoIvjP5HG7Z2ZM7VS9PPhQGW2pOpw==" - }, - "tough-cookie": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/tough-cookie/-/tough-cookie-4.0.0.tgz", - "integrity": "sha512-tHdtEpQCMrc1YLrMaqXXcj6AxhYi/xgit6mZu1+EDWUn+qhUf8wMQoFIy9NXuq23zAwtcB0t/MjACGR18pcRbg==", - "requires": { - "psl": "^1.1.33", - "punycode": "^2.1.1", - "universalify": "^0.1.2" - } - }, - "tr46": { - "version": "2.0.2", - "resolved": "https://registry.npmjs.org/tr46/-/tr46-2.0.2.tgz", - "integrity": "sha512-3n1qG+/5kg+jrbTzwAykB5yRYtQCTqOGKq5U5PE3b0a1/mzo6snDhjGS0zJVJunO0NrT3Dg1MLy5TjWP/UJppg==", - "requires": { - "punycode": "^2.1.1" - } - }, - "webidl-conversions": { - "version": "6.1.0", - "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-6.1.0.tgz", - "integrity": "sha512-qBIvFLGiBpLjfwmYAaHPXsn+ho5xZnGvyGvsarywGNc8VyQJUMHJ8OBKGGrPER0okBeMDaan4mNBlgBROxuI8w==" - }, - "whatwg-url": { - "version": "8.5.0", - "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-8.5.0.tgz", - "integrity": "sha512-fy+R77xWv0AiqfLl4nuGUlQ3/6b5uNfQ4WAbGQVMYshCTCCPK9psC1nWh3XHuxGVCtlcDDQPQW1csmmIQo+fwg==", - "requires": { - "lodash": "^4.7.0", - "tr46": 
"^2.0.2", - "webidl-conversions": "^6.1.0" - } - }, - "ws": { - "version": "7.4.4", - "resolved": "https://registry.npmjs.org/ws/-/ws-7.4.4.tgz", - "integrity": "sha512-Qm8k8ojNQIMx7S+Zp8u/uHOx7Qazv3Yv4q68MiWWWOJhiwG5W3x7iqmRtJo8xxrciZUY4vRxUTJCKuRnF28ZZw==" - } - } - }, - "source-map": { - "version": "0.6.1", - "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz", - "integrity": "sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g==", - "optional": true - }, - "sshpk": { - "version": "1.16.1", - "resolved": "https://registry.npmjs.org/sshpk/-/sshpk-1.16.1.tgz", - "integrity": "sha512-HXXqVUq7+pcKeLqqZj6mHFUMvXtOJt1uoUx09pFW6011inTMxqI8BA8PM95myrIyyKwdnzjdFjLiE6KBPVtJIg==", - "requires": { - "asn1": "~0.2.3", - "assert-plus": "^1.0.0", - "bcrypt-pbkdf": "^1.0.0", - "dashdash": "^1.12.0", - "ecc-jsbn": "~0.1.1", - "getpass": "^0.1.1", - "jsbn": "~0.1.0", - "safer-buffer": "^2.0.2", - "tweetnacl": "~0.14.0" - } - }, - "stealthy-require": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/stealthy-require/-/stealthy-require-1.1.1.tgz", - "integrity": "sha1-NbCYdbT/SfJqd35QmzCQoyJr8ks=" - }, - "stream-length": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/stream-length/-/stream-length-1.0.2.tgz", - "integrity": "sha1-gnfzy+5JpNqrz9tOL0qbXp8snwA=", - "requires": { - "bluebird": "^2.6.2" - } - }, - "string-direction": { - "version": "0.1.2", - "resolved": "https://registry.npmjs.org/string-direction/-/string-direction-0.1.2.tgz", - "integrity": "sha1-PYRT5ydKLkShQrPchEnftk2a3jo=" - }, - "string-width": { - "version": "4.2.2", - "resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.2.tgz", - "integrity": "sha512-XBJbT3N4JhVumXE0eoLU9DCjcaF92KLNqTmFCnG1pf8duUxFGwtP6AD6nkjw9a3IdiRtL3E2w3JDiE/xi3vOeA==", - "requires": { - "emoji-regex": "^8.0.0", - "is-fullwidth-code-point": "^3.0.0", - "strip-ansi": "^6.0.0" - } - }, - "string_decoder": { - "version": 
"1.3.0", - "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.3.0.tgz", - "integrity": "sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==", - "requires": { - "safe-buffer": "~5.2.0" - } - }, - "strip-ansi": { - "version": "6.0.0", - "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.0.tgz", - "integrity": "sha512-AuvKTrTfQNYNIctbR1K/YGTR1756GycPsg7b9bdV9Duqur4gv6aKqHXah67Z8ImS7WEz5QVcOtlfW2rZEugt6w==", - "requires": { - "ansi-regex": "^5.0.0" - } - }, - "strong-data-uri": { - "version": "1.0.6", - "resolved": "https://registry.npmjs.org/strong-data-uri/-/strong-data-uri-1.0.6.tgz", - "integrity": "sha512-zhzBZev0uhT2IrFUerenXhfaE0vFUYwAZsnG0gIKGpfM/Gi6jOUQ3cmcvyTsXeDLIPiTubHESeO7EbD6FoPmzw==", - "requires": { - "truncate": "^2.0.1" - } - }, - "symbol-tree": { - "version": "3.2.4", - "resolved": "https://registry.npmjs.org/symbol-tree/-/symbol-tree-3.2.4.tgz", - "integrity": "sha512-9QNk5KwDF+Bvz+PyObkmSYjI5ksVUYtjW7AU22r2NKcfLJcXp96hkDWU3+XndOsUb+AQ9QhfzfCT2O+CNWT5Tw==" - }, - "tar-fs": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/tar-fs/-/tar-fs-2.1.1.tgz", - "integrity": "sha512-V0r2Y9scmbDRLCNex/+hYzvp/zyYjvFbHPNgVTKfQvVrb6guiE/fxP+XblDNR011utopbkex2nM4dHNV6GDsng==", - "requires": { - "chownr": "^1.1.1", - "mkdirp-classic": "^0.5.2", - "pump": "^3.0.0", - "tar-stream": "^2.1.4" - } - }, - "tar-stream": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/tar-stream/-/tar-stream-2.2.0.tgz", - "integrity": "sha512-ujeqbceABgwMZxEJnk2HDY2DlnUZ+9oEcb1KzTVfYHio0UE6dG71n60d8D2I4qNvleWrrXpmjpt7vZeF1LnMZQ==", - "requires": { - "bl": "^4.0.3", - "end-of-stream": "^1.4.1", - "fs-constants": "^1.0.0", - "inherits": "^2.0.3", - "readable-stream": "^3.1.1" - } - }, - "through": { - "version": "2.3.8", - "resolved": "https://registry.npmjs.org/through/-/through-2.3.8.tgz", - "integrity": "sha1-DdTJ/6q8NXlgsbckEV1+Doai4fU=" - }, - "tmp": { - "version": 
"0.0.30", - "resolved": "https://registry.npmjs.org/tmp/-/tmp-0.0.30.tgz", - "integrity": "sha1-ckGdSovn1s51FI/YsyTlk6cRwu0=", - "requires": { - "os-tmpdir": "~1.0.1" - } - }, - "tough-cookie": { - "version": "2.5.0", - "resolved": "https://registry.npmjs.org/tough-cookie/-/tough-cookie-2.5.0.tgz", - "integrity": "sha512-nlLsUzgm1kfLXSXfRZMc1KLAugd4hqJHDTvc2hDIwS3mZAfMEuMbc03SujMF+GEcpaX/qboeycw6iO8JwVv2+g==", - "requires": { - "psl": "^1.1.28", - "punycode": "^2.1.1" - } - }, - "tr46": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/tr46/-/tr46-1.0.1.tgz", - "integrity": "sha1-qLE/1r/SSJUZZ0zN5VujaTtwbQk=", - "requires": { - "punycode": "^2.1.0" - } - }, - "truncate": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/truncate/-/truncate-2.1.0.tgz", - "integrity": "sha512-em3E3SUDONOjTBcZ36DTm3RvDded3IRU9rX32oHwwXNt3rJD5MVaFlJTQvs8tJoHRoeYP36OuQ1eL/Q7bNEWIQ==" - }, - "tunnel-agent": { - "version": "0.6.0", - "resolved": "https://registry.npmjs.org/tunnel-agent/-/tunnel-agent-0.6.0.tgz", - "integrity": "sha1-J6XeoGs2sEoKmWZ3SykIaPD8QP0=", - "requires": { - "safe-buffer": "^5.0.1" - } - }, - "turndown": { - "version": "5.0.3", - "resolved": "https://registry.npmjs.org/turndown/-/turndown-5.0.3.tgz", - "integrity": "sha512-popfGXEiedpq6F5saRIAThKxq/bbEPVFnsDnUdjaDGIre9f3/OL9Yi/yPbPcZ7RYUDpekghr666bBfZPrwNnhQ==", - "requires": { - "jsdom": "^11.9.0" - } - }, - "tweetnacl": { - "version": "0.14.5", - "resolved": "https://registry.npmjs.org/tweetnacl/-/tweetnacl-0.14.5.tgz", - "integrity": "sha1-WuaBd/GS1EViadEIr6k/+HQ/T2Q=" - }, - "type-check": { - "version": "0.3.2", - "resolved": "https://registry.npmjs.org/type-check/-/type-check-0.3.2.tgz", - "integrity": "sha1-WITKtRLPHTVeP7eE8wgEsrUg23I=", - "requires": { - "prelude-ls": "~1.1.2" - } - }, - "unbzip2-stream": { - "version": "1.4.3", - "resolved": "https://registry.npmjs.org/unbzip2-stream/-/unbzip2-stream-1.4.3.tgz", - "integrity": 
"sha512-mlExGW4w71ebDJviH16lQLtZS32VKqsSfk80GCfUlwT/4/hNRFsoscrF/c++9xinkMzECL1uL9DDwXqFWkruPg==", - "requires": { - "buffer": "^5.2.1", - "through": "^2.3.8" - } - }, - "universalify": { - "version": "0.1.2", - "resolved": "https://registry.npmjs.org/universalify/-/universalify-0.1.2.tgz", - "integrity": "sha512-rBJeI5CXAlmy1pV+617WB9J63U6XcazHHF2f2dbJix4XzpUF0RS3Zbj0FGIOCAva5P/d/GBOYaACQ1w+0azUkg==" - }, - "uri-js": { - "version": "4.4.1", - "resolved": "https://registry.npmjs.org/uri-js/-/uri-js-4.4.1.tgz", - "integrity": "sha512-7rKUyy33Q1yc98pQ1DAmLtwX109F7TIfWlW1Ydo8Wl1ii1SeHieeh0HHfPeL2fMXK6z0s8ecKs9frCuLJvndBg==", - "requires": { - "punycode": "^2.1.0" - } - }, - "url": { - "version": "0.11.0", - "resolved": "https://registry.npmjs.org/url/-/url-0.11.0.tgz", - "integrity": "sha1-ODjpfPxgUh63PFJajlW/3Z4uKPE=", - "requires": { - "punycode": "1.3.2", - "querystring": "0.2.0" - }, - "dependencies": { - "punycode": { - "version": "1.3.2", - "resolved": "https://registry.npmjs.org/punycode/-/punycode-1.3.2.tgz", - "integrity": "sha1-llOgNvt8HuQjQvIyXM7v6jkmxI0=" - } - } - }, - "util-deprecate": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz", - "integrity": "sha1-RQ1Nyfpw3nMnYvvS1KKJgUGaDM8=" - }, - "uuid": { - "version": "3.4.0", - "resolved": "https://registry.npmjs.org/uuid/-/uuid-3.4.0.tgz", - "integrity": "sha512-HjSDRw6gZE5JMggctHBcjVak08+KEVhSIiDzFnT9S9aegmp85S/bReBVTb4QTFaRNptJ9kuYaNhnbNEOkbKb/A==" - }, - "valid-url": { - "version": "1.0.9", - "resolved": "https://registry.npmjs.org/valid-url/-/valid-url-1.0.9.tgz", - "integrity": "sha1-HBRHm0DxOXp1eC8RXkCGRHQzogA=" - }, - "verror": { - "version": "1.10.0", - "resolved": "https://registry.npmjs.org/verror/-/verror-1.10.0.tgz", - "integrity": "sha1-OhBcoXBTr1XW4nDB+CiGguGNpAA=", - "requires": { - "assert-plus": "^1.0.0", - "core-util-is": "1.0.2", - "extsprintf": "^1.2.0" - } - }, - "w3c-hr-time": { - "version": "1.0.2", - "resolved": 
"https://registry.npmjs.org/w3c-hr-time/-/w3c-hr-time-1.0.2.tgz", - "integrity": "sha512-z8P5DvDNjKDoFIHK7q8r8lackT6l+jo/Ye3HOle7l9nICP9lf1Ci25fy9vHd0JOWewkIFzXIEig3TdKT7JQ5fQ==", - "requires": { - "browser-process-hrtime": "^1.0.0" - } - }, - "w3c-xmlserializer": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/w3c-xmlserializer/-/w3c-xmlserializer-2.0.0.tgz", - "integrity": "sha512-4tzD0mF8iSiMiNs30BiLO3EpfGLZUT2MSX/G+o7ZywDzliWQ3OPtTZ0PTC3B3ca1UAf4cJMHB+2Bf56EriJuRA==", - "requires": { - "xml-name-validator": "^3.0.0" - } - }, - "webidl-conversions": { - "version": "4.0.2", - "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-4.0.2.tgz", - "integrity": "sha512-YQ+BmxuTgd6UXZW3+ICGfyqRyHXVlD5GtQr5+qjiNW7bF0cqrzX500HVXPBOvgXb5YnzDd+h0zqyv61KUD7+Sg==" - }, - "whatwg-encoding": { - "version": "1.0.5", - "resolved": "https://registry.npmjs.org/whatwg-encoding/-/whatwg-encoding-1.0.5.tgz", - "integrity": "sha512-b5lim54JOPN9HtzvK9HFXvBma/rnfFeqsic0hSpjtDbVxR3dJKLc+KB4V6GgiGOvl7CY/KNh8rxSo9DKQrnUEw==", - "requires": { - "iconv-lite": "0.4.24" - }, - "dependencies": { - "iconv-lite": { - "version": "0.4.24", - "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.4.24.tgz", - "integrity": "sha512-v3MXnZAcvnywkTUEZomIActle7RXXeedOR31wwl7VlyoXO4Qi9arvSenNQWne1TcRwhCL1HwLI21bEqdpj8/rA==", - "requires": { - "safer-buffer": ">= 2.1.2 < 3" - } - } - } - }, - "whatwg-mimetype": { - "version": "2.3.0", - "resolved": "https://registry.npmjs.org/whatwg-mimetype/-/whatwg-mimetype-2.3.0.tgz", - "integrity": "sha512-M4yMwr6mAnQz76TbJm914+gPpB/nCwvZbJU28cUD6dR004SAxDLOOSUaB1JDRqLtaOV/vi0IC5lEAGFgrjGv/g==" - }, - "whatwg-url": { - "version": "6.5.0", - "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-6.5.0.tgz", - "integrity": "sha512-rhRZRqx/TLJQWUpQ6bmrt2UV4f0HCQ463yQuONJqC6fO2VoEb1pTYddbe59SkYq87aoM5A3bdhMZiUiVws+fzQ==", - "requires": { - "lodash.sortby": "^4.7.0", - "tr46": "^1.0.1", - "webidl-conversions": 
"^4.0.2" - } - }, - "word-wrap": { - "version": "1.2.3", - "resolved": "https://registry.npmjs.org/word-wrap/-/word-wrap-1.2.3.tgz", - "integrity": "sha512-Hz/mrNwitNRh/HUAtM/VT/5VH+ygD6DV7mYKZAtHOrbs8U7lvPS6xf7EJKMF0uW1KJCl0H701g3ZGus+muE5vQ==" - }, - "wrap-ansi": { - "version": "7.0.0", - "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-7.0.0.tgz", - "integrity": "sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==", - "requires": { - "ansi-styles": "^4.0.0", - "string-width": "^4.1.0", - "strip-ansi": "^6.0.0" - } - }, - "wrappy": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz", - "integrity": "sha1-tSQ9jz7BqjXxNkYFvA0QNuMKtp8=" - }, - "ws": { - "version": "5.2.2", - "resolved": "https://registry.npmjs.org/ws/-/ws-5.2.2.tgz", - "integrity": "sha512-jaHFD6PFv6UgoIVda6qZllptQsMlDEJkTQcybzzXDYM1XO9Y8em691FGMPmM46WGyLU4z9KMgQN+qrux/nhlHA==", - "requires": { - "async-limiter": "~1.0.0" - } - }, - "wuzzy": { - "version": "0.1.6", - "resolved": "https://registry.npmjs.org/wuzzy/-/wuzzy-0.1.6.tgz", - "integrity": "sha512-x1lDcj0VvzJ1ygDpd9LWMnQVei6gEkUbCcZUG8TPnXhlPbaQWQa32ab/6xbm/samxJ2T3Y2+P3xHeeQIAcEvqQ==", - "requires": { - "lodash": "^4.17.15" - } - }, - "xml-name-validator": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/xml-name-validator/-/xml-name-validator-3.0.0.tgz", - "integrity": "sha512-A5CUptxDsvxKJEU3yO6DuWBSJz/qizqzJKOMIfUJHETbBw/sFaDxgd6fxm1ewUaM0jZ444Fc5vC5ROYurg/4Pw==" - }, - "xmlchars": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/xmlchars/-/xmlchars-2.2.0.tgz", - "integrity": "sha512-JZnDKK8B0RCDw84FNdDAIpZK+JuJw+s7Lz8nksI7SIuU3UXJJslUthsi+uWBUYOwPFwW7W7PRLRfUKpxjtjFCw==" - }, - "y18n": { - "version": "5.0.6", - "resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.6.tgz", - "integrity": "sha512-PlVX4Y0lDTN6E2V4ES2tEdyvXkeKzxa8c/vo0pxPr/TqbztddTP0yn7zZylIyiAuxerqj0Q5GhpJ1YJCP8LaZQ==" - }, - "yargs": { - 
"version": "16.2.0", - "resolved": "https://registry.npmjs.org/yargs/-/yargs-16.2.0.tgz", - "integrity": "sha512-D1mvvtDG0L5ft/jGWkLpG1+m0eQxOfaBvTNELraWj22wSVUMWxZUvYgJYcKh6jGGIkJFhH4IZPQhR4TKpc8mBw==", - "requires": { - "cliui": "^7.0.2", - "escalade": "^3.1.1", - "get-caller-file": "^2.0.5", - "require-directory": "^2.1.1", - "string-width": "^4.2.0", - "y18n": "^5.0.5", - "yargs-parser": "^20.2.2" - }, - "dependencies": { - "yargs-parser": { - "version": "20.2.7", - "resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-20.2.7.tgz", - "integrity": "sha512-FiNkvbeHzB/syOjIUxFDCnhSfzAL8R5vs40MgLFBorXACCOAEaWu0gRZl14vG8MR9AOJIZbmkjhusqBYZ3HTHw==" - } - } - }, - "yargs-parser": { - "version": "13.1.2", - "resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-13.1.2.tgz", - "integrity": "sha512-3lbsNRf/j+A4QuSZfDRA7HRSfWrzO0YjqTJd5kjAq37Zep1CEgaYmrH9Q3GwPiB9cHyd1Y1UwggGhJGoxipbzg==", - "requires": { - "camelcase": "^5.0.0", - "decamelize": "^1.2.0" - } - }, - "yauzl": { - "version": "2.10.0", - "resolved": "https://registry.npmjs.org/yauzl/-/yauzl-2.10.0.tgz", - "integrity": "sha1-x+sXyT4RLLEIb6bY5R+wZnt5pfk=", - "requires": { - "buffer-crc32": "~0.2.3", - "fd-slicer": "~1.1.0" - } - } - } -} diff --git a/package.json b/package.json deleted file mode 100644 index b3cc70c3d1..0000000000 --- a/package.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "name": "archivebox", - "version": "0.6.2", - "description": "ArchiveBox: The self-hosted internet archive", - "author": "Nick Sweeting ", - "repository": "github:ArchiveBox/ArchiveBox", - "license": "MIT", - "dependencies": { - "@postlight/mercury-parser": "^2.2.0", - "readability-extractor": "git+https://github.com/ArchiveBox/readability-extractor.git", - "single-file": "git+https://github.com/gildas-lormeau/SingleFile.git" - } -} diff --git a/pip_dist b/pip_dist deleted file mode 160000 index 534998571c..0000000000 --- a/pip_dist +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 
534998571c9a2ddff462a9c8f3ed5ea825f91958 diff --git a/pkg/debian/archivebox b/pkg/debian/archivebox new file mode 100755 index 0000000000..0ebccabcd3 --- /dev/null +++ b/pkg/debian/archivebox @@ -0,0 +1,15 @@ +#!/bin/bash +# /usr/bin/archivebox - wrapper script installed by the archivebox .deb package +# Activates the pip-installed virtualenv and runs archivebox CLI + +ARCHIVEBOX_VENV="/opt/archivebox/venv" + +if [ ! -f "$ARCHIVEBOX_VENV/bin/archivebox" ]; then + echo "Error: ArchiveBox is not installed in $ARCHIVEBOX_VENV" + echo "Try running: sudo /opt/archivebox/install.sh" + exit 1 +fi + +# Export venv bin to PATH so bundled console scripts (yt-dlp, etc.) are discoverable +export PATH="$ARCHIVEBOX_VENV/bin:$PATH" +exec "$ARCHIVEBOX_VENV/bin/archivebox" "$@" diff --git a/pkg/debian/archivebox.service b/pkg/debian/archivebox.service new file mode 100644 index 0000000000..af8b4500d4 --- /dev/null +++ b/pkg/debian/archivebox.service @@ -0,0 +1,20 @@ +# The archivebox user/group and /var/lib/archivebox directory are created by +# postinstall.sh (which runs after dpkg unpacks the package contents). 
+ +[Unit] +Description=ArchiveBox Web Archiving Server +After=network.target + +[Service] +Type=simple +User=archivebox +Group=archivebox +WorkingDirectory=/var/lib/archivebox +Environment="PATH=/opt/archivebox/venv/bin:/usr/local/bin:/usr/bin:/bin" +ExecStartPre=/usr/bin/archivebox init +ExecStart=/usr/bin/archivebox server 0.0.0.0:8000 +Restart=on-failure +RestartSec=5 + +[Install] +WantedBy=multi-user.target diff --git a/pkg/debian/install.sh b/pkg/debian/install.sh new file mode 100755 index 0000000000..1c72758eff --- /dev/null +++ b/pkg/debian/install.sh @@ -0,0 +1,51 @@ +#!/bin/bash +# /opt/archivebox/install.sh - installs/upgrades archivebox into its virtualenv +# Called by the postinstall script and can be run manually to upgrade + +set -e + +ARCHIVEBOX_VENV="/opt/archivebox/venv" +ARCHIVEBOX_VERSION="${ARCHIVEBOX_VERSION:-}" + +# ArchiveBox requires Python >= 3.13 (per pyproject.toml). +# Prefer python3.13 explicitly; fall back to python3 with a version check. +if command -v python3.13 >/dev/null 2>&1; then + PYTHON="python3.13" +elif command -v python3 >/dev/null 2>&1; then + PYTHON="python3" + PY_MAJOR="$("$PYTHON" -c 'import sys; print(sys.version_info.major)')" + PY_MINOR="$("$PYTHON" -c 'import sys; print(sys.version_info.minor)')" + if [ "$PY_MAJOR" -lt 3 ] || { [ "$PY_MAJOR" -eq 3 ] && [ "$PY_MINOR" -lt 13 ]; }; then + PY_VER="${PY_MAJOR}.${PY_MINOR}" + echo "[!] Error: ArchiveBox requires Python >= 3.13, but found Python $PY_VER" + echo " Install python3.13: sudo apt install python3.13 python3.13-venv" + exit 1 + fi +else + echo "[!] Error: python3 not found. Install python3.13: sudo apt install python3.13 python3.13-venv" + exit 1 +fi + +echo "[+] Setting up ArchiveBox virtualenv in $ARCHIVEBOX_VENV (using $PYTHON)..." + +# Create the virtualenv if it doesn't exist +if [ ! 
-d "$ARCHIVEBOX_VENV" ]; then + "$PYTHON" -m venv "$ARCHIVEBOX_VENV" +fi + +# Upgrade pip inside the virtualenv +"$ARCHIVEBOX_VENV/bin/python3" -m pip install --quiet --upgrade pip setuptools + +# Install or upgrade archivebox. +# ARCHIVEBOX_VERSION is set by postinstall.sh from the .deb package version. +# When run manually without it, install the latest release from PyPI. +if [ -n "$ARCHIVEBOX_VERSION" ]; then + echo "[+] Installing archivebox==$ARCHIVEBOX_VERSION..." + "$ARCHIVEBOX_VENV/bin/pip" install --quiet --upgrade "archivebox==$ARCHIVEBOX_VERSION" +else + echo "[+] Installing latest archivebox (no version pinned)..." + "$ARCHIVEBOX_VENV/bin/pip" install --quiet --upgrade archivebox +fi + +echo "[√] ArchiveBox installed successfully." +echo " Run 'archivebox version' to verify." diff --git a/pkg/debian/nfpm.yaml b/pkg/debian/nfpm.yaml new file mode 100644 index 0000000000..fa3df26c08 --- /dev/null +++ b/pkg/debian/nfpm.yaml @@ -0,0 +1,69 @@ +# nFPM configuration for building ArchiveBox .deb packages +# Docs: https://nfpm.goreleaser.com/configuration/ +# Usage: nfpm package --config pkg/debian/nfpm.yaml --packager deb --target dist/ + +name: archivebox +arch: "${ARCH:-amd64}" +platform: linux +version: "${VERSION}" +version_schema: semver +maintainer: "Nick Sweeting " +description: | + Self-hosted internet archiving solution. + Save pages from the web including HTML, PDF, screenshots, media, and more. + Install with: sudo apt install archivebox && archivebox init --setup +vendor: "ArchiveBox" +homepage: "https://archivebox.io" +license: "MIT" +section: "web" +priority: "optional" + +depends: + # python3 >= 3.11 allows dpkg to install on more systems (e.g. Ubuntu 24.04). + # install.sh enforces the real >= 3.13 requirement at venv creation time, + # failing early with a clear error if only an older python3 is available. + - python3 (>= 3.11) + - python3-pip + - python3-venv + # All other runtime deps (node, chrome, yt-dlp, etc.) 
are installed on-demand + # by `archivebox install` and should NOT be declared as package dependencies. + +recommends: + # Common utilities used by archivebox extractors. Declared as recommends + # (not depends) so dpkg doesn't hard-fail if they're missing, but apt + # installs them by default so users have a working baseline out of the box. + - git + - curl + - wget + +contents: + # Wrapper script for /usr/bin/archivebox + - src: pkg/debian/archivebox + dst: /usr/bin/archivebox + file_info: + mode: 0755 + + # Install helper script + - src: pkg/debian/install.sh + dst: /opt/archivebox/install.sh + file_info: + mode: 0755 + + # Systemd service file + - src: pkg/debian/archivebox.service + dst: /usr/lib/systemd/system/archivebox.service + file_info: + mode: 0644 + + # Create data directory (unpacked as root; postinstall.sh chowns to archivebox user) + - dst: /var/lib/archivebox + type: dir + file_info: + mode: 0755 + +scripts: + postinstall: pkg/debian/scripts/postinstall.sh + preremove: pkg/debian/scripts/preremove.sh + +deb: + compression: zstd diff --git a/pkg/debian/scripts/postinstall.sh b/pkg/debian/scripts/postinstall.sh new file mode 100755 index 0000000000..a175c1b597 --- /dev/null +++ b/pkg/debian/scripts/postinstall.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# postinstall script for archivebox .deb package +set -e + +# Create archivebox system user if it doesn't exist +if ! 
id -u archivebox >/dev/null 2>&1; then + useradd --system --shell /bin/bash --home-dir /var/lib/archivebox --create-home archivebox + echo "[+] Created archivebox system user" +fi + +# Ensure data directory exists and is owned by archivebox +mkdir -p /var/lib/archivebox +chown archivebox:archivebox /var/lib/archivebox + +# Run the virtualenv install script, pinning to the .deb package version +ARCHIVEBOX_VERSION="$(dpkg-query -W -f='${Version}' archivebox 2>/dev/null || echo '')" +export ARCHIVEBOX_VERSION +/opt/archivebox/install.sh + +# Reload systemd to pick up the service file (skip if systemd is not running) +if command -v systemctl >/dev/null 2>&1 && [ -d /run/systemd/system ]; then + systemctl daemon-reload + + # On upgrade: restart the service if it was enabled (prerm stopped it) + if [ "$1" = "configure" ] && systemctl is-enabled archivebox >/dev/null 2>&1; then + systemctl start archivebox 2>/dev/null || true + echo "[+] Restarted archivebox service after upgrade" + else + echo "[i] To start ArchiveBox: sudo systemctl start archivebox" + echo "[i] To enable on boot: sudo systemctl enable archivebox" + fi +fi diff --git a/pkg/debian/scripts/preremove.sh b/pkg/debian/scripts/preremove.sh new file mode 100755 index 0000000000..65fc7330cc --- /dev/null +++ b/pkg/debian/scripts/preremove.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# preremove script for archivebox .deb package +set -e + +# dpkg passes "$1" as "remove", "upgrade", "deconfigure", or "failed-upgrade" ("purge" is only passed to postrm). + +# Always stop the service before removing or upgrading, because postinstall +# replaces the venv in-place — the running process would use stale binaries. +if command -v systemctl >/dev/null 2>&1 && [ -d /run/systemd/system ]; then + systemctl stop archivebox 2>/dev/null || true +fi + +# Only disable + clean up on full removal, not during upgrade.
+if [ "$1" = "remove" ] || [ "$1" = "purge" ]; then + if command -v systemctl >/dev/null 2>&1 && [ -d /run/systemd/system ]; then + systemctl disable archivebox 2>/dev/null || true + fi + + echo "[+] Removing ArchiveBox virtualenv..." + rm -rf /opt/archivebox/venv + + echo "[i] Your ArchiveBox data in /var/lib/archivebox has NOT been removed." + echo " The 'archivebox' system user has NOT been removed." + echo " Remove them manually if you no longer need them:" + echo " sudo rm -rf /var/lib/archivebox" + echo " sudo userdel archivebox" +fi diff --git a/pyproject.toml b/pyproject.toml new file mode 100755 index 0000000000..a8c24bc133 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,308 @@ +[project] +name = "archivebox" +version = "0.9.29rc1" +requires-python = ">=3.13" +description = "Self-hosted internet archiving solution." +authors = [{name = "Nick Sweeting", email = "pyproject.toml@archivebox.io"}] +license = {text = "MIT"} +readme = "README.md" +keywords = ["internet archiving", "web archiving", "digipres", "warc", "preservation", "backups", "archiving", "web", "bookmarks", "puppeteer", "browser", "download"] +classifiers = [ + "Development Status :: 4 - Beta", + "Environment :: Console", + "Environment :: Web Environment", + "Framework :: Django", + "Intended Audience :: Developers", + "Intended Audience :: Education", + "Intended Audience :: End Users/Desktop", + "Intended Audience :: Information Technology", + "Intended Audience :: Legal Industry", + "Intended Audience :: System Administrators", + "License :: OSI Approved :: MIT License", + "Natural Language :: English", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", + "Topic :: Internet :: WWW/HTTP", + "Topic :: Internet :: WWW/HTTP :: Indexing/Search", + "Topic :: Internet :: WWW/HTTP :: WSGI :: Application", + "Topic :: Sociology :: History", + "Topic :: Software Development :: Libraries 
:: Python Modules", + "Topic :: System :: Archiving", + "Topic :: System :: Archiving :: Backup", + "Topic :: System :: Recovery Tools", + "Topic :: Utilities", + "Typing :: Typed", +] + + +dependencies = [ + ### Django libraries + "setuptools>=74.1.0", # for: django 5 on python >=3.12, distutils is no longer in stdlib but django 5.1 expects distutils (TODO: check if this can be removed eventually) + "django>=6.0", + "daphne>=4.2.1", # ASGI server for Django (no channels needed - websockets not used) + "django-ninja>=1.5.1", + "django-extensions>=3.2.3", + "django-signal-webhooks>=0.3.0", + "django-admin-data-views>=0.4.1", + "django-object-actions>=4.3.0", + "django-taggit==6.1.0", # TODO: remove this in favor of KVTags only + ### State Management + "python-statemachine>=2.3.6", + ### CLI / Logging + "click>=8.3.1", # for: nicer CLI command + argument definitions + "rich>=14.2.0", # for: pretty CLI output + "rich-click>=1.9.5", # for: pretty CLI command help text & output + "ipython>=8.27.0", # for: archivebox shell (TODO: replace with bpython?) + ### Host OS / System + "supervisor>=4.2.5", # for: archivebox server starting daphne and workers + "psutil>=6.0.0", # for: monitoring orchestrator,actors,workers,etc. and machine.models.Process + "platformdirs>=4.3.6", # for: finding a xdg-config dir to store tmp/lib files in + "py-machineid>=0.6.0", # for: machine/detect.py calculating unique machine guid + "atomicwrites==1.4.1", # for: config file writes, index.json file writes, etc.
(TODO: remove this deprecated lib in favor of archivebox.filestore.util/os.rename/os.replace) + ### Base Types + "pydantic>=2.8.0", # for: archivebox.api (django-ninja), archivebox.config (pydantic-settings), and archivebox.index.schema (pydantic) + "pydantic-settings>=2.5.2", # for: archivebox.config + "python-benedict[io,parse]>=0.33.2", # for: dict replacement all over the codebase to allow .attr-style access + "base32-crockford>=0.3.0", # for: encoding UUIDs in base32 + ### Static Typing + "django-stubs>=5.0.4", # for: vscode type hints on models and common django APIs + ### API clients + "requests>=2.32.3", # for: fetching title, static files, headers (TODO: replace with httpx?) + "sonic-client>=1.0.0", + ### Parsers + "dateparser>=1.2.0", # for: parsing pocket/pinboard/etc. RSS/bookmark import dates + "croniter>=6.0.0", # for: validating and computing crawl schedules + "tzdata>=2024.2", # needed for dateparser {TZ: UTC} on some systems: https://github.com/ArchiveBox/ArchiveBox/issues/1553 + "w3lib>=2.2.1", # used for parsing content-type encoding from http response headers & html tags + ### Extractor dependencies (optional binaries detected at runtime via shutil.which) + ### Binary/Package Management + "abxbus>=2.4.9", # explicit direct dep so local dev env resolves sibling abxbus repo, matching abx-dl EventBus API + "abx-pkg>=1.9.27", # for: detecting, versioning, and installing binaries via apt/brew/pip/npm + "abx-plugins>=1.10.29", # shared ArchiveBox plugin package with install_args-only overrides + "abx-dl>=1.10.29", # shared ArchiveBox downloader package with install_args-only overrides + ### UUID7 backport for Python <3.14 + "uuid7>=0.1.0; python_version < '3.14'", # provides the uuid_extensions module on Python 3.13 +] + +[project.optional-dependencies] +sonic = [ + # sonic client lib now included by default, sonic group is now a no-op: + # "sonic-client>=1.0.0", + + # to use sonic make sure you have a sonic server running in docker (archivebox/sonic) 
or locally: + # echo "deb [signed-by=/usr/share/keyrings/valeriansaliou_sonic.gpg] https://packagecloud.io/valeriansaliou/sonic/debian/ bookworm main" > /etc/apt/sources.list.d/valeriansaliou_sonic.list + # curl -fsSL https://packagecloud.io/valeriansaliou/sonic/gpgkey | gpg --dearmor -o /usr/share/keyrings/valeriansaliou_sonic.gpg + # apt install sonic +] +ldap = [ + # python-ldap depends on the openldap bindings which provide no prebuilt wheels because they link against tons of other system packages + # apt install build-essential python3-dev python3-ldap libsasl2-dev libldap2-dev libssl-dev + "python-ldap>=3.4.3", + "django-auth-ldap>=4.1.0", +] +debug = [ + # packages needed for running with DEBUG=True + "django-debug-toolbar>=4.4.6", + "djdt_flamegraph>=0.2.13", + "ipdb>=0.13.13", + "requests-tracker>=0.3.3", + "django-autotyping>=0.5.1", +] +all = [ + "archivebox[sonic,ldap,debug]" +] + +[dependency-groups] +dev = [ + ### BUILD + "uv>=0.11.3", + "bumpver>=2023.1129", + #"homebrew-pypi-poet>=0.10.0", # for: generating archivebox.rb brewfile list of python packages + ### DOCS + "recommonmark>=0.7.1", + "sphinx>=8.1.3", + "sphinx-rtd-theme>=2.0.0", + "myst-parser>=4.0.0", + "sphinx-autodoc2>=0.5.0", + "linkify-it-py>=2.0.3", + ### DEBUGGING + "django-debug-toolbar>=4.4.6", + "requests-tracker>=0.3.3", + "djdt_flamegraph>=0.2.13", + "ipdb>=0.13.13", + "logfire[django]>=0.51.0", + "opentelemetry-instrumentation-django>=0.47b0", + "opentelemetry-instrumentation-sqlite3>=0.47b0", + "viztracer>=0.17.0", # usage: viztracer ../.venv/bin/archivebox manage check + # "snakeviz", # usage: python -m cProfile -o flamegraph.prof ../.venv/bin/archivebox manage check + ### TESTING + "pytest-django>=4.11.1", + "pytest>=8.3.3", + "pytest-cov>=6.0.0", + "pytest-httpserver>=1.1.0", + "coverage[toml]>=7.6.0", + "bottle>=0.13.1", + ### LINTING + "prek>=0.3.6", + "ruff>=0.6.6", + "pyright>=1.1.406", + "ty>=0.0.1a19", +] + +[tool.uv] +environments = ["sys_platform == 'darwin'", 
"sys_platform == 'linux'"] +package = true +exclude-newer = "5 days" +exclude-newer-package = { abx-plugins = "1 second", abx-dl = "1 second", abx-pkg = "1 second", abxbus = "1 second" } +# compile-bytecode = true + +[build-system] +requires = ["pdm-backend"] +build-backend = "pdm.backend" +# https://github.com/astral-sh/uv/issues/3957 + +[tool.setuptools] +packages = ["archivebox"] +package-dir = {"archivebox" = "archivebox"} + +[tool.ruff] +line-length = 140 +target-version = "py313" +src = ["archivebox"] +exclude = ["*.pyi", "typings/", "migrations/", "archivebox/tests/data/"] + +# https://docs.astral.sh/ruff/rules/ +[tool.ruff.lint] +ignore = ["E731", "E303", "E266", "E241", "E222"] + +[tool.codespell] +ignore-words-list = "abx,archivebox,adminsnapshots,bu,wit,dont,cant,wont,havent,thats,shouldnt,doesnt,doenst,re-use,re-used,re-using,re-usable" +skip = "*.json,*.min.js,*.min.css,uv.lock,old/*,website/*" + +[tool.pytest.ini_options] +testpaths = [ "archivebox/tests" ] +norecursedirs = ["archivebox/tests/data"] +DJANGO_SETTINGS_MODULE = "archivebox.core.settings" +# Note: Plugin tests under abx_plugins/plugins/ must NOT load Django +# They use a conftest.py to disable Django automatically + +[tool.coverage.run] +# Enable branch coverage (tracks if/else branches) +branch = true +# What to measure +source = ["archivebox"] +# Support parallel execution (for integration tests, dev server, etc.) +parallel = true +# Base name for coverage data files (in parallel mode coverage.py appends a machine/PID suffix; run `coverage combine` to merge)
+data_file = ".coverage" +# What to exclude +omit = [ + "*/tests/*", + "*/test_*.py", + "*/migrations/*", + "*/typings/*", + "*/__pycache__/*", + "*/node_modules/*", + "*/.venv/*", + "*/manage.py", +] + +[tool.coverage.report] +# Show lines missing coverage +show_missing = true +# Skip files with no executable code +skip_empty = true +# Fail if coverage below this (set to 0 for now) +fail_under = 0 +# Exclude patterns (regex) +exclude_lines = [ + # Standard pragma + "pragma: no cover", + # Don't complain about missing debug code + "def __repr__", + "if self.debug", + # Don't complain if tests don't cover defensive assertion code + "raise AssertionError", + "raise NotImplementedError", + # Don't complain if non-runnable code isn't run + "if 0:", + "if False:", + "if __name__ == .__main__.:", + # Type checking blocks + "if TYPE_CHECKING:", + # Abstract methods + "@(abc\\.)?abstractmethod", +] + +[tool.coverage.html] +directory = "htmlcov" + +[tool.coverage.json] +output = "coverage.json" +show_contexts = true + +[tool.pyright] +include = [ + "archivebox", +] +exclude = [ + ".venv", + "**/*.pyi", + "**/__init__.pyi", + "**/node_modules", + "**/__pycache__", + "**/migrations", + "archivebox/tests/data", + "archivebox/tests/data/**", +] +stubPath = "./typings" +venvPath = "." 
+venv = ".venv" +# ignore = ["src/oldstuff"] +# defineConstant = { DEBUG = true } +reportMissingImports = true +reportMissingTypeStubs = false +pythonVersion = "3.13" +pythonPlatform = "Linux" + +[tool.ty] +environment = { python-version = "3.13", python-platform = "linux" } +src = { include = ["archivebox"], exclude = [".venv", "**/*.pyi", "**/__init__.pyi", "**/node_modules", "**/__pycache__", "**/migrations", "archivebox/tests/data", "archivebox/tests/data/**"] } + + +[project.scripts] +archivebox = "archivebox.cli:main" + + +[project.urls] +Homepage = "https://github.com/ArchiveBox/ArchiveBox" +Source = "https://github.com/ArchiveBox/ArchiveBox" +Documentation = "https://github.com/ArchiveBox/ArchiveBox/wiki" +"Bug Tracker" = "https://github.com/ArchiveBox/ArchiveBox/issues" +Changelog = "https://github.com/ArchiveBox/ArchiveBox/releases" +Roadmap = "https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap" +Community = "https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community" +Demo = "https://demo.archivebox.io" +Donate = "https://github.com/ArchiveBox/ArchiveBox/wiki/Donations" + + + +[tool.bumpver] +current_version = "v0.8.5rc53" +version_pattern = "vMAJOR.MINOR.PATCH[PYTAGNUM]" +commit_message = "bump version {old_version} -> {new_version}" +tag_message = "{new_version}" +tag_scope = "default" +pre_commit_hook = "" +post_commit_hook = "" +commit = true +tag = true +push = true + +[tool.bumpver.file_patterns] +"pyproject.toml" = [ + 'current_version = "{version}"', + 'version = "{pep440_version}"', +] diff --git a/setup.py b/setup.py deleted file mode 100755 index ebfb923379..0000000000 --- a/setup.py +++ /dev/null @@ -1,142 +0,0 @@ -import json -import setuptools -from setuptools.command.test import test - -from pathlib import Path - - -PKG_NAME = "archivebox" -DESCRIPTION = "The self-hosted internet archive." 
-LICENSE = "MIT" -AUTHOR = "Nick Sweeting" -AUTHOR_EMAIL="git@nicksweeting.com" -REPO_URL = "https://github.com/ArchiveBox/ArchiveBox" -PROJECT_URLS = { - "Source": f"{REPO_URL}", - "Documentation": f"{REPO_URL}/wiki", - "Bug Tracker": f"{REPO_URL}/issues", - "Changelog": f"{REPO_URL}/wiki/Changelog", - "Roadmap": f"{REPO_URL}/wiki/Roadmap", - "Community": f"{REPO_URL}/wiki/Web-Archiving-Community", - "Donate": f"{REPO_URL}/wiki/Donations", -} - -ROOT_DIR = Path(__file__).parent.resolve() -PACKAGE_DIR = ROOT_DIR / PKG_NAME - -README = (PACKAGE_DIR / "README.md").read_text(encoding='utf-8', errors='ignore') -VERSION = json.loads((PACKAGE_DIR / "package.json").read_text().strip())['version'] - -PYTHON_REQUIRES = ">=3.7" -SETUP_REQUIRES = ["wheel"] -INSTALL_REQUIRES = [ - # only add things here that have corresponding apt python3-packages available - # anything added here also needs to be added to our package dependencies in - # stdeb.cfg (apt), archivebox.rb (brew), Dockerfile, etc. - # if there is no apt python3-package equivalent, then vendor it instead in - # ./archivebox/vendor/ - "requests>=2.24.0", - "mypy-extensions>=0.4.3", - "django>=3.1.3,<3.2", - "django-extensions>=3.0.3", - "dateparser", - "ipython", - "youtube-dl", - "python-crontab>=2.5.1", - "croniter>=0.3.34", - "w3lib>=1.22.0", -] -EXTRAS_REQUIRE = { - 'sonic': [ - "sonic-client>=0.0.5", - ], - 'dev': [ - "setuptools", - "twine", - "wheel", - "flake8", - "ipdb", - "mypy", - "django-stubs", - "sphinx", - "sphinx-rtd-theme", - "recommonmark", - "pytest", - "bottle", - "stdeb", - "django-debug-toolbar", - "djdt_flamegraph", - ], -} - -# To see when setup.py gets called (uncomment for debugging): -# import sys -# print(PACKAGE_DIR, f" (v{VERSION})") -# print('>', sys.executable, *sys.argv) - - -class DisabledTestCommand(test): - def run(self): - # setup.py test is deprecated, disable it here by force so stdeb doesnt run it - print() - print('[X] Running tests via setup.py test is deprecated.') - print(' 
Hint: Use the ./bin/test.sh script or pytest instead') - - -setuptools.setup( - name=PKG_NAME, - version=VERSION, - license=LICENSE, - author=AUTHOR, - author_email=AUTHOR_EMAIL, - description=DESCRIPTION, - long_description=README, - long_description_content_type="text/markdown", - url=REPO_URL, - project_urls=PROJECT_URLS, - python_requires=PYTHON_REQUIRES, - setup_requires=SETUP_REQUIRES, - install_requires=INSTALL_REQUIRES, - extras_require=EXTRAS_REQUIRE, - packages=[PKG_NAME], - include_package_data=True, # see MANIFEST.in - entry_points={ - "console_scripts": [ - f"{PKG_NAME} = {PKG_NAME}.cli:main", - ], - }, - classifiers=[ - "License :: OSI Approved :: MIT License", - "Natural Language :: English", - "Operating System :: OS Independent", - "Development Status :: 4 - Beta", - - "Topic :: Utilities", - "Topic :: System :: Archiving", - "Topic :: System :: Archiving :: Backup", - "Topic :: System :: Recovery Tools", - "Topic :: Sociology :: History", - "Topic :: Internet :: WWW/HTTP", - "Topic :: Internet :: WWW/HTTP :: Indexing/Search", - "Topic :: Internet :: WWW/HTTP :: WSGI :: Application", - "Topic :: Software Development :: Libraries :: Python Modules", - - "Intended Audience :: Developers", - "Intended Audience :: Education", - "Intended Audience :: End Users/Desktop", - "Intended Audience :: Information Technology", - "Intended Audience :: Legal Industry", - "Intended Audience :: System Administrators", - - "Environment :: Console", - "Environment :: Web Environment", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", - "Framework :: Django", - "Typing :: Typed", - ], - cmdclass={ - "test": DisabledTestCommand, - }, -) diff --git a/stdeb.cfg b/stdeb.cfg deleted file mode 100644 index 251e76c534..0000000000 --- a/stdeb.cfg +++ /dev/null @@ -1,10 +0,0 @@ -[DEFAULT] -Source: archivebox -Package: archivebox -Package3: archivebox -Suite: focal -Suite3: focal -Build-Depends: 
dh-python, python3-pip, python3-setuptools, python3-wheel, python3-stdeb -Depends3: nodejs, wget, curl, git, ffmpeg, youtube-dl, python3-croniter, python3-crontab, python3-dateparser, python3-django, python3-django-extensions, python3-django-jsonfield, python3-mypy-extensions, python3-requests, python3-w3lib, ripgrep -XS-Python-Version: >= 3.7 -Setup-Env-Vars: DEB_BUILD_OPTIONS=nocheck diff --git a/tests/conftest.py b/tests/conftest.py deleted file mode 100644 index 20128da75a..0000000000 --- a/tests/conftest.py +++ /dev/null @@ -1,19 +0,0 @@ -from multiprocessing import Process - -import pytest -from .mock_server.server import start - -server_process = None - -@pytest.hookimpl -def pytest_sessionstart(session): - global server_process - server_process = Process(target=start) - server_process.start() - -@pytest.hookimpl -def pytest_sessionfinish(session): - if server_process is not None: - server_process.terminate() - server_process.join() - \ No newline at end of file diff --git a/tests/fixtures.py b/tests/fixtures.py deleted file mode 100644 index cca722f386..0000000000 --- a/tests/fixtures.py +++ /dev/null @@ -1,28 +0,0 @@ -import os -import subprocess - -import pytest - -@pytest.fixture -def process(tmp_path): - os.chdir(tmp_path) - process = subprocess.run(['archivebox', 'init'], capture_output=True) - return process - -@pytest.fixture -def disable_extractors_dict(): - env = os.environ.copy() - env.update({ - "USE_WGET": "false", - "USE_SINGLEFILE": "false", - "USE_READABILITY": "false", - "USE_MERCURY": "false", - "SAVE_PDF": "false", - "SAVE_SCREENSHOT": "false", - "SAVE_DOM": "false", - "SAVE_HEADERS": "false", - "USE_GIT": "false", - "SAVE_MEDIA": "false", - "SAVE_ARCHIVE_DOT_ORG": "false" - }) - return env diff --git a/tests/mock_server/server.py b/tests/mock_server/server.py deleted file mode 100644 index 4283574f30..0000000000 --- a/tests/mock_server/server.py +++ /dev/null @@ -1,53 +0,0 @@ -from os import getcwd -from pathlib import Path - -from bottle 
import route, run, static_file, response, redirect - -@route("/") -def index(): - return "Hello" - -@route("/static/") -def static_path(filename): - template_path = Path.cwd().resolve() / "tests/mock_server/templates" - response = static_file(filename, root=template_path) - return response - -@route("/static_no_content_type/") -def static_no_content_type(filename): - template_path = Path.cwd().resolve() / "tests/mock_server/templates" - response = static_file(filename, root=template_path) - response.set_header("Content-Type", "") - return response - -@route("/static/headers/") -def static_path_with_headers(filename): - template_path = Path.cwd().resolve() / "tests/mock_server/templates" - response = static_file(filename, root=template_path) - response.add_header("Content-Language", "en") - response.add_header("Content-Script-Type", "text/javascript") - response.add_header("Content-Style-Type", "text/css") - return response - -@route("/static/400/", method="HEAD") -def static_400(filename): - template_path = Path.cwd().resolve() / "tests/mock_server/templates" - response = static_file(filename, root=template_path) - response.status = 400 - response.add_header("Status-Code", "400") - return response - -@route("/static/400/", method="GET") -def static_200(filename): - template_path = Path.cwd().resolve() / "tests/mock_server/templates" - response = static_file(filename, root=template_path) - response.add_header("Status-Code", "200") - return response - -@route("/redirect/headers/") -def redirect_to_static(filename): - redirect(f"/static/headers/$filename") - - -def start(): - run(host='localhost', port=8080) \ No newline at end of file diff --git a/tests/mock_server/templates/example.com.html b/tests/mock_server/templates/example.com.html deleted file mode 100644 index 8469956cd4..0000000000 --- a/tests/mock_server/templates/example.com.html +++ /dev/null @@ -1,49 +0,0 @@ - - - - Example Domain - - - - - - - - -
    -

    Example Domain

    -

    This domain is for use in illustrative examples in documents. You may use this - domain in literature without prior coordination or asking for permission.

    -

    - More information... -

    -
    - - diff --git a/tests/mock_server/templates/iana.org.html b/tests/mock_server/templates/iana.org.html deleted file mode 100644 index c1e60a2e9c..0000000000 --- a/tests/mock_server/templates/iana.org.html +++ /dev/null @@ -1,390 +0,0 @@ - - - - IANA — IANA-managed Reserved Domains - - - - - - - - - - - - - - - - - -
    - -
    - -
    - - -
    - - -

    IANA-managed Reserved Domains

    - -

    Certain domains are set aside, and nominally registered to “IANA”, for specific - policy or technical purposes.

    - -

    Example domains

    - -

    As described in - RFC 2606 - and - RFC 6761, - a number of domains such as - example.com - and - example.org - are maintained for documentation purposes. These domains may be used as illustrative - examples in documents without prior coordination with us. They are - not available for registration or transfer.

    - -

    Test IDN top-level domains

    - -

    These domains were temporarily delegated by IANA for the - IDN Evaluation - being conducted by - ICANN.

    - -
    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    DomainDomain (A-label)LanguageScript
    إختبار - - XN--KGBECHTV - - ArabicArabic
    آزمایشی - - XN--HGBK6AJ7F53BBA - - PersianArabic
    测试 - - XN--0ZWM56D - - ChineseHan (Simplified variant)
    測試 - - XN--G6W251D - - ChineseHan (Traditional variant)
    испытание - - XN--80AKHBYKNJ4F - - RussianCyrillic
    परीक्षा - - XN--11B5BS3A9AJ6G - - HindiDevanagari (Nagari)
    δοκιμή - - XN--JXALPDLP - - Greek, Modern (1453-)Greek
    테스트 - - XN--9T4B11YI5A - - KoreanHangul (Hangŭl, Hangeul)
    טעסט - - XN--DEBA0AD - - YiddishHebrew
    テスト - - XN--ZCKZAH - - JapaneseKatakana
    பரிட்சை - - XN--HLCJ6AYA9ESC7A - - TamilTamil
    -
    - -

    Policy-reserved domains

    - -

    We act as both the registrant and registrar for a select number of domains - which have been reserved under policy grounds. These exclusions are - typically indicated in either technical standards (RFC documents), - or - contractual limitations.

    - -

    Domains which are described as registered to IANA or ICANN on policy - grounds are not available for registration or transfer, with the exception - of - - country-name.info - domains. These domains are available for release - by the ICANN Governmental Advisory Committee Secretariat.

    - -

    Other Special-Use Domains

    - -

    There is additionally a - Special-Use Domain Names - registry documenting special-use domains designated by technical standards. For further information, see - Special-Use Domain Names - (RFC 6761).

    - - -
    - - - - -
    - - diff --git a/tests/mock_server/templates/malformed.html b/tests/mock_server/templates/malformed.html deleted file mode 100644 index 6116059db7..0000000000 --- a/tests/mock_server/templates/malformed.html +++ /dev/null @@ -1,8 +0,0 @@ - - - -malformed document - - diff --git a/tests/mock_server/templates/shift_jis.html b/tests/mock_server/templates/shift_jis.html deleted file mode 100644 index 622039a5ba..0000000000 --- a/tests/mock_server/templates/shift_jis.html +++ /dev/null @@ -1,769 +0,0 @@ - - - - - - - - - - - - Ž­Ž™“‡‚Ėƒjƒ…[ƒXbMBC“ė“ú–{•ú‘— - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -

    MBC NEWS

    - -
    -
      - -
    • - -
    • -
    • - -
    • -
    -
    -
    - - - -
    -

    07ŒŽ22“ú(…)

    -
  • -

    z–K”VŖ“‡‚Ŕ𔭁@•Ŧ‰Œ‚P‚Q‚O‚Oƒ[ƒgƒ‹ - [23:10] -

    -

    \“‡‘ē‚ːz–K”VŖ“‡‚Å‚Q‚Q“ú–éA”š”­“I•Ŧ‰Î‚Ē”­ļ‚ĩA•Ŧ‰Œ‚Ē‰ÎŒû‚Š‚į‚P‚Q‚O‚Oƒ[ƒgƒ‹‚Ė‚‚ŗ‚܂ŏã‚Ē‚č‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    “ņ\Žlß‹Cu‘友v@Ž­Ž™“‡Žs‚Å‚R‚TD‚T“x@‰‚Ė–Ō‹“ú[20:03] -

    -

    ‚Q‚Q“ú‚Í“ņ\Žlß‹C‚Ėˆę‚u‘友v‚ŁA‚P”N‚ÅÅ‚ā‹‚ĸŽžŠú‚Æ‚ŗ‚ę‚Ü‚ˇB

    -
    -
  • -
  • -

    u‚f‚‚s‚ƒgƒ‰ƒxƒ‹vƒLƒƒƒ“ƒy[ƒ“ŠJŽn@ŒË˜f‚ĸ‚Æ•sˆĀ‚ːē‚ā[20:02] -

    -

    VŒ^ƒRƒƒiƒEƒCƒ‹ƒX‚ˉe‹ŋ‚őŌ‚‚đŽķ‚¯‚Ä‚ĸ‚éŠĪŒõ‹ÆŠE‚đŽx‰‡‚ˇ‚鍑‚ˁu‚f‚‚s‚ƒgƒ‰ƒxƒ‹vƒLƒƒƒ“ƒy[ƒ“‚Ē‚Q‚Q“ú‚Š‚įŽn‚Ü‚č‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    ‚S˜A‹x‘O‚Ɂ@Ž­Ž™“‡‹ķ`‚ŐVŒ^ƒRƒƒi‘΍ô‹­‰ģ@o”­‹q‚ĖŒŸ‰ˇ‚ā[19:48] -

    -

    ‚Q‚R“ú‚Š‚į‚Ė‚S˜A‹xAVŒ^ƒRƒƒiƒEƒCƒ‹ƒX‚Ė‘Îô‚đ‹­‰ģ‚ˇ‚é‚Ŋ‚߁AŽ­Ž™“‡‹ķ`‚ł̓T[ƒ‚ƒOƒ‰ƒtƒB[‚Ē‘Ũ‚ŗ‚ęAV‚Ŋ‚ɏo”­‹q‚ˑˉˇ‘Ē’č‚āŽn‚Ü‚č‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    VŒ^ƒRƒƒiV‚Ŋ‚É‚QlŠ´õ@ƒNƒ‰ƒXƒ^[—Ž‚ŋ’…‚­‚ā‘΍ôŒp‘ą‚đ[19:48] -

    -

    Ž­Ž™“‡Œ§“ā‚ł͂Q‚Q“úAVŒ^ƒRƒƒiƒEƒCƒ‹ƒX‚ĖŠ´õŽŌ‚ǐV‚Ŋ‚É‚QlŠm”F‚ŗ‚ęA—ŨŒv‚Í‚P‚V‚Sl‚Æ‚Č‚č‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    ‹L˜^“I‘å‰J‚Å”íŠQ@Ž­Ž™“‡Œ§ˆÉ˛Žs‚đ]“Ą”_…‘Š‚ĒŽ‹Ž@[19:47] -

    -

    ĄŒŽã{‚Ė‹L˜^“I‘å‰J‚Å‘å‚̂ȔíŠQ‚đŽķ‚¯‚ŊŽ­Ž™“‡Œ§ˆÉ˛Žs‚đ‚Q‚Q“úA]“Ą‘ņ”_—Ґ…ŽY‘åb‚Ē–K‚ęA”_‹Æ”íŠQ‚Ėķ‹ĩ‚Č‚Į‚đŠm”F‚ĩ‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    ‚Z–ė‹…h‘ã‘Ö‘å‰īh ŒˆŸƒg[ƒiƒƒ“ƒg‚ĒŠJ–‹[19:46] -

    -

    VŒ^ƒRƒƒiƒEƒCƒ‹ƒX‚ˉe‹ŋ‚Å’†Ž~‚Æ‚Č‚Á‚ŊŽ­Ž™“‡Œ§‚Ė‰Ä‚Ė‚Z–ė‹…‚Ė‘ã‘Ö‘å‰ī‚́A‚Q‚Q“ú‚Š‚įŠe’n‹æ‚Ė‘ã•\‚P‚UZ‚É‚æ‚錈Ÿƒg[ƒiƒƒ“ƒg‚ĒŽn‚Ü‚č‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    ŦŠwZ‚ĖZ’ë‚Ė–Ø‚ÅƒAƒIƒoƒYƒN‚ĒŽqˆį‚Ä’†@Ž­Ž™“‡Œ§ˆĸ‹vĒŽs[19:44] -

    -

    Ž­Ž™“‡Œ§ˆĸ‹vĒŽs‚ĖŦŠwZ‚ĖZ’ë‚ɐA‚Ļ‚į‚ę‚Ŋ–؂ŁAƒAƒIƒoƒYƒN‚ĒŽqˆį‚Ä‚đ‚ĩ‚Ä‚ĸ‚āAŠwZ‚ĖŽq‚Į‚ā‚Ŋ‚ŋ‚Ē‚ģ‚Ė—lŽq‚đŒŠŽį‚Á‚Ä‚ĸ‚Ü‚ˇB

    -
    -
  • -
  • -

    VŽ­Ž™“‡Œ§’mŽ–E‰–“cNˆęށ‚É•ˇ‚­@V‘‡‘ĖˆįŠŲŽ”õ‚Æ–{`‹æÄŠJ”­[19:44] -

    -

    —ˆT‚Q‚W“ú‚É’mŽ–‚ɏA”C‚ˇ‚鉖“cNˆę‚ŗ‚ņ‚ɁAŒ§­‚Ė‰Û‘č‚𕡂­ƒVƒŠ[ƒYB

    -
    -
  • -
  • -

    •Ûˆį‰€Ž™‚āŽûŠn@ƒuƒhƒE‚Ė‚Í‚ŗ‚Ũ“ü‚ꎎ@ŽF–€ė“āŽs[19:43] -

    -

    Ž­Ž™“‡Œ§“ā—L”‚ĖƒuƒhƒE‚ĖŽY’nAŽF–€ė“āŽs‚ĖƒuƒhƒE‰€‚Å‚Q‚Q“úA‚Í‚ŗ‚Ũ“ü‚ꎎ‚Ēs‚í‚ę‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    Ž­Ž™“‡Œ§VŒ^ƒRƒƒi@V‚Ŋ‚É‚QlŠ´õŠm”F - [18:10] -

    -

    Ž­Ž™“‡Œ§‚Í‚Q‚Q“úAVŒ^ƒRƒƒiƒEƒCƒ‹ƒX‚ĖŠ´õŽŌ‚đV‚Ŋ‚É‚QlŠm”F‚ĩ‚Ŋ‚Æ”­•\‚ĩ‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    ˆųH“XŒo‰cŽŌ‚į‚ǐVŒ^ƒRƒƒi‘΍ô‚đŠw‚ԁ@Ž­Ž™“‡Žs[16:14] -

    -

    Ž­Ž™“‡Žs‚Å‚Q‚Q“úAˆųH“X‚Č‚Į‚ĖŒo‰cŽŌ‚į‚ǐVŒ^ƒRƒƒi‘΍ô‚đŠw‚ԁAŒ¤C‰ī‚ĒŠJ‚Š‚ę‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    ˜V•܃zƒeƒ‹‚ljc‹ÆÄŠJ@ƒv[ƒ‹ŠJ‚́@Ž­Ž™“‡Œ§ŽwhŽs[16:13] -

    -

    Ž­Ž™“‡Œ§ŽwhŽs‚Ė˜V•܃zƒeƒ‹AŽwh”’…ŠŲ‚Å–{Ši“I‚ȉĂđ‘O‚ɁAP—á‚Ėƒv[ƒ‹ŠJ‚Ģ‚Ēs‚í‚ę‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    Ž­Ž™“‡‹ķ`‚ɃT[ƒ‚ƒOƒ‰ƒtƒB[‚R‘äŨ’u@˜A‹x‘O‚ɐVŒ^ƒRƒƒi‘΍ô‹­‰ģ[12:20] -

    -

    ‚Q‚R“ú‚Š‚į‚Ė‚S˜A‹x‚đ‘O‚ÉŽ­Ž™“‡‹ķ`‚Ė‘“āü‚ɂ́AVŒ^ƒRƒƒiƒEƒCƒ‹ƒX‚ĖŠ´õŠg‘å‚đ–h‚Ž‚Ŋ‚߁AŒŸ‰ˇ—p‚ːV‚Ŋ‚ČƒT[ƒ‚ƒOƒ‰ƒtƒB[‚R‘ä‚ǐŨ’u‚ŗ‚ę‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    VŒ^ƒRƒƒi‚Å”­•\‰ī’†Ž~@ŠwZ‚Ė’†’ë‚Ń_ƒ“ƒX‚đ”â˜I[12:19] -

    -

    Ž­Ž™“‡Œ§–Žs‚Ė’†ŠwZ‚ǁAVŒ^ƒRƒƒiƒEƒCƒ‹ƒX‚ˉe‹ŋ‚Ń_ƒ“ƒX”­•\‚Ė‹@‰ī‚đŽ¸‚Á‚Ŋļ“k‚ÉŠˆ–ô‚Ėę‚đ’ņ‹Ÿ‚ĩ‚悤‚ƁA”­•\‰ī‚đŠJ‚̂܂ĩ‚ŊB

    -
    -
  • -
  • -

    ŽF–€A‘å‹÷AŽíŽq“‡E‰Ž‹v’n•û‚ɍ‚‰ˇ’ˆĶî•ņ@“ú’†‚R‚T“xˆČã—\‘z[10:56] -

    -

    ŽF–€E‘å‹÷’n•ûAŽíŽq“‡E‰Ž‹v“‡’n•û‚Í‚Q‚Q“úA“ú’†‚Ė‹C‰ˇ‚Ē‚R‚T“xˆČã‚Ė–Ō‹“ú‚Æ‚Č‚é‚Æ‚ą‚ë‚Ē‚ ‚錊ž‚Ũ‚Å‚ˇB

    -
    -
  • -

    07ŒŽ21“ú(‰Î)

    -
  • -

    ‰‚”üŽsƒRƒ“ƒrƒj‹­“–ĸ‹Ž–Œ@’j‚É’Ļ–đ‚S”N‹ŒY[20:07] -

    -

    Ž­Ž™“‡Œ§‰‚”üŽs‚ŋޔN‚PŒŽAƒRƒ“ƒrƒjƒGƒ“ƒXƒXƒgƒA‚É•ī’š‚đŽ‚Á‚ĉŸ‚ĩ“ü‚čŒģ‹ā‚đ’D‚¨‚¤‚Æ‚ĩ‚Ŋ‚Æ‚ĩ‚āA‹­“–ĸ‹‚Ėß‚É–â‚í‚ę‚Ä‚ĸ‚é’j‚ĖŲ”ģ‚ĒŽ­Ž™“‡’nŲ–ŧŖŽx•”‚ÅŠJ‚Š‚ęAŒŸŽ@‚Í’j‚É’Ļ–đ‚S”N‚đ‹ŒY‚ĩ‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    VŒ^ƒRƒƒi@V‚Ŋ‚É‚QlŠ´õŠm”F@Ž­Ž™“‡Œ§“ā‚P‚V‚Ql‚É[19:51] -

    -

    Ž­Ž™“‡Žs‚ŐVŒ^ƒRƒƒiƒEƒCƒ‹ƒX‚ĖŠ´õŽŌ‚ǐV‚Ŋ‚É‚QlŠm”F‚ŗ‚ęAŽ­Ž™“‡Œ§“ā‚ĖŠ´õŽŌ‚Ė—ŨŒv‚Í‚P‚V‚Ql‚Æ‚Č‚č‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    VŽ­Ž™“‡Œ§’mŽ–E‰–“cNˆęށ‚É•ˇ‚­@VŒ^ƒRƒƒi‘΍ô[19:49] -

    -

    ĄŒŽ‚P‚Q“ú‚ɍs‚í‚ę‚ŊŽ­Ž™“‡Œ§’mŽ–‘I‹“‚ŏ‰“–‘I‚ĩ‚Ŋ‰–“cNˆę‚ŗ‚ņ‚́AĄŒŽ‚Q‚W“ú‚É’mŽ–‚ɏA”C‚ĩ‚Ü‚ˇB

    -
    -
  • -
  • -

    ˆę•”ŠwZ‚ʼnċx‚ŨŠJŽn@ˆę•û‚ÅŽö‹Æ‘ą‚­ŠwZ‚ā[19:48] -

    -

    Ž­Ž™“‡Œ§“ā‚Ėˆę•”‚ĖŠwZ‚ł͂Q‚P“ú‚Š‚į‰Ä‹x‚Ũ‚ĒŽn‚Ü‚č‚Ü‚ĩ‚Ŋ‚ǁAˆę•û‚ŐVŒ^ƒRƒƒiƒEƒCƒ‹ƒX‚ɔ炤‹xZ‚É‚æ‚éŽö‹Æ‚Ė’x‚ę‚đŽæ‚č–ß‚ˇ‚Ŋ‚߁A‚PŠwŠú‚ĖŽö‹Æ‚Ē‘ą‚ĸ‚Ä‚ĸ‚éŠwZ‚ā‚ ‚č‚Ü‚ˇB

    -
    -
  • -
  • -

    ƒlƒIƒƒCƒYœa¯@Ž­Ž™“‡‚Å‚āŽB‚Á‚ŊI[19:47] -

    -

    ŠĪ‘ĒđŒŽŸ‘æ‚ł́A“÷Šá‚ÅŒŠ‚é‚ą‚Æ‚Ē‚Å‚Ģ‚é‚Ų‚Į–ž‚é‚ĸ‚ƁAƒCƒ“ƒ^[ƒlƒbƒg‚Č‚Į‚Řb‘肯‚Č‚Á‚Ä‚ĸ‚éœa¯uƒlƒIƒƒCƒYœa¯vB

    -
    -
  • -
  • -

    ‰‚”ü‚Ė–¯—wEƒVƒ}‰S‚Ė‘æˆęlŽŌ@’ØŽR–L‚ŗ‚ņŽ€‹Ž[19:46] -

    -

    Ž­Ž™“‡Œ§“ŋ”V“‡‚Ė“Ŧ‹‚đƒ‚ƒ`[ƒt‚É‚ĩ‚ŊuƒƒCƒhßv‚Ėė‹ČŽŌ‚ŁA‰‚”ü‚Ė–¯—wEƒVƒ}‰S‚Ė‘æˆęlŽŌ‚Æ‚ĩ‚ÄŠˆ–ô‚ĩ‚Ŋ’ØŽR–L‚ŗ‚ņ‚Ē‚Q‚O“úA˜VŠ‚Ė‚Ŋ‚ß–S‚­‚Č‚č‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    ‚i‚qŽ­Ž™“‡–{ü@Ž­Ž™“‡’†‰›`ė“ā@ˆę•”‹æŠÔ‚Q‚V“ú‚Š‚įÄŠJ[19:38] -

    -

    ‘å‰J‚ˉe‹ŋ‚Å‚i‚qŽ­Ž™“‡–{ü‚ĖŽ­Ž™“‡’†‰›‰w‚Ɛė“ā‰w‚ĖŠÔ‚ÍA‰^“]ŒŠ‡‚킚‚Ē‘ą‚ĸ‚Ä‚ĸ‚Ü‚ˇ‚ǁAˆę•”‹æŠÔ‚Ē‚Q‚V“ú‚Š‚į—ÕŽžƒ_ƒCƒ„‚ōĊJ‚ˇ‚邹‚Æ‚É‚Č‚č‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    ‚¨’†Œŗ¤í@VŒ^ƒRƒƒi‚ˉe‹ŋ‚ŕΉģ‚ā@Ž­Ž™“‡Žs‚Ėƒfƒp[ƒg[19:36] -

    -

    ‚¨’†Œŗ‚Ė‹Gß‚đŒ}‚ςĂĸ‚Ü‚ˇ‚ǁAVŒ^ƒRƒƒiƒEƒCƒ‹ƒX‚ˉe‹ŋ‚ā‚ ‚čAĄ”N‚Ė‚¨’†Œŗ¤í‚É‚Í•Ī‰ģ‚ā‚ ‚邿‚¤‚Å‚ˇB

    -
    -
  • -
  • -

    ŽíŽq“‡“ė“Œ‰Ģ‚Å’nk@“ėŽíŽq’Ŧ‚Ők“x‚P[18:03] -

    -

    ‚Q‚P“úŒßŒã‚TŽž‚T‚S•Ē‚˛‚ëAŽíŽq“‡“ė“Œ‰Ģ‚đkŒš’n‚Æ‚ˇ‚é’nk‚Ē‚ ‚č‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    “y—p‰N‚Ė“ú@ƒEƒiƒMę–å“X‚É‚Ŧ‚키[16:36] -

    -

    ‚Q‚P“ú‚Í“y—p‚ˉN‚Ė“úAŽ­Ž™“‡Žs‚ĖƒEƒiƒMę–å“X‚͑吨‚Ė‹q‚łɂŦ‚í‚Á‚Ä‚ĸ‚Ü‚ˇB

    -
    -
  • -
  • -

    ’†Šwļ‚ǁg‹ā•ôƒRƒVƒqƒJƒŠh‚ĖˆîŠ ‚č‘ĖŒą@Ž­Ž™“‡Œ§“낺‚‚܎s[16:35] -

    -

    ’´‘ę•Ä‚ĖŽY’nAŽ­Ž™“‡Œ§“낺‚‚܎s‹ā•ô’Ŧ‚ŁA’nŒŗ‚Ė’†Šwļ‚ĒˆîŠ ‚č‚đ‘ĖŒą‚ĩ‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    ˆĻ—ĮŽs‚ĖŠé‹Æ‚ĒŽ­Ž™“‡Žs‚Ɉã—Ã}ƒXƒN‚S–œ–‡‚𑥂é[16:34] -

    -

    VŒ^ƒRƒƒiƒEƒCƒ‹ƒX‚ĖŠ´õ—\–h‘΍ô‚ɖ𗧂ĂĂā‚ႍ‚¤‚ƁAŽ­Ž™“‡Œ§“ā‚Ń^ƒCƒ„”Ė”„Ž–‹Æ‚đŽčŠ|‚¯‚éˆĻ—ĮŽs‚ĖŠé‹Æ‚ǁAŽ­Ž™“‡Žs‚Ƀ}ƒXƒN‚S–œ–‡‚𑥂č‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    Ž­Ž™“‡EŒ§“š‚U‚R†@—L–ž–k‚h‚b[—L–ž“Œ‚h‚b@’ʍsŽ~‚ß - [15:25] -

    -

    Ž­Ž™“‡Œ§‚ĖŒ§“š‚U‚R†Žu•zŽu•ŸŽRü‚Ė—L–ž–kƒCƒ“ƒ^[‚Æ—L–ž“ŒƒCƒ“ƒ^[‚ĖŠÔ‚ĒAŠ×–v‚Ė‚Ŋ‚ß’ĘsŽ~‚ß‚Æ‚Č‚Á‚Ä‚ĸ‚Ü‚ˇB

    -
    -
  • -
  • -

    ƒgƒ‰ƒNƒ^[‚ˉē•~‚̂ɂȂč’jĢŽ€–S@Ž­Ž™“‡Œ§“ú’uŽs[15:06] -

    -

    Ž­Ž™“‡Œ§“ú’uŽs‚Å‚Q‚P“úŒß‘OA‚—î‚Ė’jĢ‚Ēƒgƒ‰ƒNƒ^[‚ˉē•~‚̂ɂȂčAŽ€–S‚ĩ‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    ‚Š‚˛‚ĩ‚ܐ…‘°ŠŲ‚É‚T–œ•C‚ĖƒJƒ^ƒNƒ`ƒCƒƒV‚Ē’‡ŠÔ“ü‚č[12:00] -

    -

    ‚Q‚R“ú‚Š‚į‚Ė˜A‹x‚đ‘O‚É‚Q‚P“ú’ЁA‚Š‚˛‚ĩ‚ܐ…‘°ŠŲ‚É‚T–œ•C‚ĖƒJƒ^ƒNƒ`ƒCƒƒV‚Ē’‡ŠÔ“ü‚č‚ĩA‘‘ŦAŒQ‚ę‚đ‚Č‚ĩ‚ĉj‚Ž—lŽq‚ĒŒŠ‚į‚ę‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    ‚Zļ‚ĒŠĪŒõE–hĐ‘΍ô‚đŽs‚É’ņŒž@Ž­Ž™“‡Œ§–Žs[11:54] -

    -

    •ļ•”‰ČŠwČ‚ĖƒX[ƒp[ƒTƒCƒGƒ“ƒXƒnƒCƒXƒN[ƒ‹‚ÉŽw’肺‚ę‚Ä‚ĸ‚éAŽ­Ž™“‡Œ§–Žs‚Ė‘•Ē‚Z‚ǁAŠĪŒõ‚â–hĐ‚Č‚Į‚ɂ‚ĸ‚Ă˒ņŒž‚đŽs‚ɍs‚ĸ‚Ü‚ĩ‚ŊB

    -
    -
  • -

    07ŒŽ20“ú(ŒŽ)

    -
  • -

    Ž­Ž™“‡Žs‚Ė`‚ÅŒŠ‚Â‚Š‚Á‚Ŋˆâ‘ˁ@‚S‚VÎ’jĢ‚Æ”ģ–ž[20:26] -

    -

    Ž­Ž™“‡Žs‚Ė`‚Å‚P‚W“ú‚ÉŒŠ‚Â‚Š‚Á‚Ŋˆâ‘˂ːgŒŗ‚ɂ‚ĸ‚āAŒxŽ@‚Í‚Q‚O“úAŽs“ā‚ɏZ‚Ū‚S‚VÎ‚Ė“y–Øė‹Æˆõ‚Ė’jĢ‚ž‚Á‚Ŋ‚Æ”­•\‚ĩ‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    •Ŋ”N‚æ‚č‚Q‚P“ú’x‚­@‰‚”ü’n•û@ŠĪ‘ĒŽjãÅ‚ā’x‚ĸ”~‰J–ž‚¯[19:42] -

    -

    ‚Q‚O“ú‚ˉ‚”ü’n•û‚́A‘ž•Ŋ—m‚‹Cˆŗ‚É•ĸ‚í‚ę‚Ћķ‚ĒL‚Ē‚čAŽ­Ž™“‡’n•û‹CÛ‘ä‚͌ߑO‚P‚PŽž‚Ɂu‰‚”ü’n•û‚Í”~‰J–ž‚¯‚ĩ‚Ŋ‚Æ‚Ũ‚į‚ę‚év‚Æ”­•\‚ĩ‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    ‰‚”üE—´‹Ŋ’Ŧ‚ĖŦ’†ŠwZ‚ŏI‹ÆŽŽ@Ž­Ž™“‡Œ§“ā‚Ėˆę•”ŠwZ‚ljċx‚Ũ‚Ö[19:41] -

    -

    VŒ^ƒRƒƒiƒEƒCƒ‹ƒX‚ˉe‹ŋ‚Å‹xZ‘[’u‚ĒŽæ‚į‚ę‚ŊŽ­Ž™“‡Œ§“ā‚ĖŒö—§ŦE’†ŠwZ‚Ė‘Ŋ‚­‚ł́A‰Ä‹x‚Ũ‚đ’Zk‚ˇ‚é•ûj‚Å‚ˇ‚ǁA—\’č’Ę‚č‚Q‚P“ú‚Š‚į‰Ä‹x‚Ũ‚É“ü‚é—Ŗ“‡‚Č‚Įˆę•”‚ĖŠwZ‚ł́A‚Q‚O“úA‚PŠwŠú‚ĖI‹ÆŽŽ‚Ēs‚í‚ę‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    ŠC…—ę‚ÅˆęŽž‚Sl‚Ē“M‚ę‚é@‘Sˆõ‹~•@Ž­Ž™“‡Œ§ˆĸ‹vĒŽs[19:40] -

    -

    Ž­Ž™“‡Œ§ˆĸ‹vĒŽs‚ĖŠC…—ę‚Å‚Q‚O“úŒßŒãA—Ģ‚Sl‚Ē“M‚ęA‹~•‚ŗ‚ę‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    uƒfƒBƒXƒJƒo[Ž­Ž™“‡v‚ĖŽŠl—vŋ‚đ‰„’ˇ@‚WŒŽ‚S“ú‚Ü‚Å[19:39] -

    -

    Ž­Ž™“‡Œ§‚͐VŒ^ƒRƒƒi‚ĖŠ´õŽŌ”‘‰Á‚đŽķ‚¯A—˜—pŽŌ‚ÉŽŠl‚đ—vŋ‚ĩ‚Ä‚ĸ‚éh”‘Ž{ŨŽx‰‡ƒLƒƒƒ“ƒy[ƒ“uƒfƒBƒXƒJƒo[Ž­Ž™“‡v‚ĖŽŠl—vŋŠúŠÔ‚đA—ˆŒŽ‚S“ú‚܂ʼn„’ˇ‚ˇ‚邹‚Æ‚đ”­•\‚ĩ‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    uˆĀSˆĀ‘S‚Ė“V•ļŠŲ‚ɁvˆųH“X‚¨‚æ‚ģ‚T‚O“X•Ü‚ĒˆęÄÁ“Ł@Ž­Ž™“‡Žs[19:38] -

    -

    Ú‘Ō‚đ”炤ˆųH“X‚đ‘ΏۂɁAŽ­Ž™“‡Œ§‚Š‚įo‚ŗ‚ę‚Ä‚ĸ‚Ŋ‹x‹Æ—vŋ‚ĖŠúŠÔ‚ǁA–ž“ú‚Ü‚Å‚Æ‚Č‚č‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    “ÆŽŠ‚Ė‚o‚b‚qŒŸ¸‹@Ší‚ĖŽŽŒą‰^—pŠJŽn@Ž­Ž™“‡Œ§–Žs[19:37] -

    -

    Ž­Ž™“‡Œ§–Žs‚́AVŒ^ƒRƒƒiƒEƒCƒ‹ƒX‚Ö‚ĖŠ´õ‚Ė—L–ŗ‚𒲂ׂé‚o‚b‚qŒŸ¸‹@Ší‚ˉ^—p‚đA“ÆŽŠ‚É‚Q‚O“ú‚Š‚įŽn‚߂܂ĩ‚ŊB

    -
    -
  • -
  • -

    VŒ^ƒRƒƒi@‘‚ĖŠî€u‘Ū‰@‘O‚É‚o‚b‚qŒŸ¸‚š‚¸v@Ē‹’‚́H[19:36] -

    -

    Ž­Ž™“‡Žs‚ĖƒVƒ‡[ƒpƒu‚ŁA‘“āÅ‘勉‚ĖƒNƒ‰ƒXƒ^[‚Ē”­ļ‚ĩAŒ§“ā‚ł͍ĄŒŽ‚É“ü‚čAˆã—Ë@ŠÖ‚Ö‚Ė“ü‰@‚âƒzƒeƒ‹‚ŗ×{‚ˇ‚él‚Ē‘‰Á‚ĩ‚Ä‚ĸ‚Ü‚ˇB

    -
    -
  • -
  • -

    ‚t‚`‚d‚Ė‰Î¯’T¸‹@“‹Ú@‚g‚QAƒƒPƒbƒg‘Å‚ŋã‚°ŦŒ÷[19:35] -

    -

    ‚t‚`‚dƒAƒ‰ƒuŽņ’ˇ‘˜A–M‚Ė‰Î¯’T¸‹@‚đ“‹Ú‚ĩ‚Ŋ‚g‚Q‚`ƒƒPƒbƒg‚ǁAŽ­Ž™“‡Œ§‚ĖŽíŽq“‡‰F’ˆƒZƒ“ƒ^[‚Š‚į‘Å‚ŋã‚°‚į‚ęA‘Å‚ŋã‚°‚͐ŦŒ÷‚ĩ‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    V’ĄŽÉˆÚ“]–â‘č@Z–¯“Š•[‚đ‚WŒŽ‚X“ú‚ÉŽĀŽ{@Ž­Ž™“‡Œ§‚…Žs[19:34] -

    -

    Ž­Ž™“‡Œ§‚…Žs‚ːV‚ĩ‚ĸ’ĄŽÉ‚ĖˆÚ“]V’zŒv‰æ‚ːĨ”ņ‚đ–₤Z–¯“Š•[‚ǁA—ˆŒŽ‚X“ú‚ɍs‚í‚ę‚邹‚Æ‚É‚Č‚č‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    ƒRƒƒi‚É•‰‚¯‚Č‚ĸIƒRƒƒi‰Đ‚ŐV‚ĩ‚ĸŒ`‚ˉ^“މī[19:34] -

    -

    VŒ^ƒRƒƒiƒEƒCƒ‹ƒX‚ĖŠ´õŠg‘å‚Őæ‚ĒŒŠ‚Ļ‚Č‚ĸ•sˆĀ‚Ė’†A‹t‹Ģ‚É—§‚ŋŒü‚Š‚¤l‚âŠé‹Æ‚đĐ‰î‚ˇ‚éƒVƒŠ[ƒYuŽ­Ž™“‡”­ƒRƒƒi‚É•‰‚¯‚Č‚ĸIvĄ‰ņ‚́AƒRƒƒi‰Đ‚łːV‚ĩ‚ĸŒ`‚łˉ^“މī‚ɂ‚ĸ‚ÄŽæŪ‚ĩ‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    ‚Q‚P“ú‚́u“y—p‰N‚Ė“úv@ƒEƒiƒM‚Ė‚Š‚ÎÄ‚Ģo‰×ƒs[ƒN@Ž­Ž™“‡Œ§‘åč’Ŧ[19:32] -

    -

    ‚Q‚P“ú‚ˁu“y—p‚ˉN‚Ė“úv‚đ‘O‚ɁAŽ­Ž™“‡Œ§‘åč’Ŧ‚ł́AƒEƒiƒM‚Ė‚Š‚ÎÄ‚Ģ‚Č‚Į‚Ėo‰×‚Ēƒs[ƒN‚đŒ}‚ςĂĸ‚Ü‚ˇB

    -
    -
  • -
  • -

    VŒ^ƒRƒƒi@Ž­Ž™“‡Žs‚ŐV‚Ŋ‚É‚Tl‚ĖŠ´õŠm”F@Œ§“ā‚P‚V‚Ol‚É[17:29] -

    -

    Ž­Ž™“‡Œ§“ā‚ł͂Q‚O“úAV‚Ŋ‚ɐVŒ^ƒRƒƒiƒEƒCƒ‹ƒX‚Ö‚ĖŠ´õŽŌ‚Ē‚TlŠm”F‚ŗ‚ę‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    Ž­Ž™“‡Eė“āŒ´”­‚P†‹@@§Œä–_‹Č‚Ē‚Á‚ŊŒ´ˆö‚Í‘}“üŽž‚ĖÚG‚Š[17:11] -

    -

    ’čŠúŒŸ¸’†‚ĖŽ­Ž™“‡Œ§‚ːė“āŒ´”­‚P†‹@‚ł́AĄŒŽ‚P‚U“ú‚ÉŒ´Žq˜F‚ĖŠj•Ē—ô‚đ§Œä‚ˇ‚鐧Œä–_‚Ė‚¤‚ŋ‚Ė‚P–{‚NjȂǂÁ‚Ä‚ĸ‚é‚Ė‚ĒŒŠ‚Â‚Š‚č‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    ‰‚”ü’n•û@ŠĪ‘ĒŽjãÅ‚ā’x‚ĸ”~‰J–ž‚¯[11:02] -

    -

    Ž­Ž™“‡’n•û‹CÛ‘ä‚́AŒß‘O‚P‚PŽž‚Ɂu‰‚”ü’n•û‚Í”~‰J–ž‚¯‚ĩ‚Ŋ‚Æ‚Ũ‚į‚ę‚év‚Æ”­•\‚ĩ‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    ‚g‚Q‚`ƒƒPƒbƒg‘Å‚ŋã‚°ŦŒ÷@‚t‚`‚d‚Ė‰Î¯’T¸‹@“‹Ú[07:57] -

    -

    ‚t‚`‚dƒAƒ‰ƒuŽņ’ˇ‘˜A–M‚Ė‰Î¯’T¸‹@‚đ“‹Ú‚ĩ‚Ŋ‚g‚Q‚`ƒƒPƒbƒg‚Ē‚Q‚O“ú’ŠŽíŽq“‡‰F’ˆƒZƒ“ƒ^[‚Š‚į‘Å‚ŋã‚°‚į‚ęA‘Å‚ŋã‚°‚͐ŦŒ÷‚ĩ‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    ‚g‚Q‚`ƒƒPƒbƒg‘Å‚ŋã‚°@‚t‚`‚d‚Ė‰Î¯’T¸‹@“‹Ú[07:18] -

    -

    ‚t‚`‚dƒAƒ‰ƒuŽņ’ˇ‘˜A–M‚Ė‰Î¯’T¸‹@‚đ“‹Ú‚ĩ‚Ŋ‚g‚Q‚`ƒƒPƒbƒg‚ǁAæ‚Ų‚ĮŒß‘O‚VŽž‘O‚ÉŽíŽq“‡‰F’ˆƒZƒ“ƒ^[‚Š‚į‘Å‚ŋã‚°‚į‚ę‚Ü‚ĩ‚ŊB

    -
    -
  • -

    07ŒŽ19“ú(“ú)

    -
  • -

    ‚g‚Q‚`ƒƒPƒbƒg‚S‚Q†‹@@‚Q‚O“ú’Бłŋã‚°[18:15] -

    -

    “VŒķ•s—Į‚Ė‚Ŋ‚ߑłŋã‚°‚lj„Šú‚ŗ‚ę‚Ä‚ĸ‚Ŋ‚g‚Q‚`ƒƒPƒbƒg‚S‚Q†‹@‚́A‚Q‚O“ú’ЁAŽíŽq“‡‰F’ˆƒZƒ“ƒ^[‚Š‚į‘Å‚ŋã‚°‚į‚ę‚Ü‚ˇB

    -
    -
  • -
  • -

    u‚f‚‚s‚ƒgƒ‰ƒxƒ‹v„‚č@ŽO”Ŋ‰€’mŽ–u‚Ü‚¸‚͋ߗגnˆæ‚Łv[18:13] -

    -

    Ž­Ž™“‡Œ§‚ĖŽO”Ŋ‰€’mŽ–‚́A‚P‚X“ú‚ɍs‚í‚ę‚Ŋ‘S‘’mŽ–‰ī‚ĖƒEƒFƒu‰ī‹c‚ŁA­•{‚ĒŠĪŒõŽx‰‡‚ÅŽn‚ß‚éu‚f‚‚s‚ƒgƒ‰ƒxƒ‹v‚ɂ‚ĸ‚āAuVŒ^ƒRƒƒiƒEƒCƒ‹ƒXŠ´õŠg‘å–hŽ~‚Ė‚Ŋ‚߁A‹ß—×’nˆæ‚Š‚įŽn‚ß‚é‚ׂ́v‚Æ‚Ėl‚Ļ‚đŽĻ‚ĩ‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    VŒ^ƒRƒƒi@Ž­Ž™“‡Œ§“āV‚Ŋ‚É‚Pl‚ĖŠ´õŠm”F[17:41] -

    -

    Ž­Ž™“‡Žs‚͐æ‚Ų‚ĮAVŒ^ƒRƒƒiƒEƒCƒ‹ƒX‚ĖŠ´õŽŌ‚ǐV‚Ŋ‚É‚PlŠm”F‚ŗ‚ę‚Ŋ‚Æ”­•\‚ĩ‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    ‹™`‚Å’jĢ‚Ē“]—ށ@ˆĶޝ•s–ž@Ž­Ž™“‡E“낺‚‚܎s[17:30] -

    -

    Ž­Ž™“‡Œ§“낺‚‚܎s‚Ė‹™`‰Ģ‚Å‚P‚X“úŒß‘OA‘D‚Åė‹Æ’†‚Ė’jĢ‚ĒŠC‚É“]—Ž‚ĩAˆĶޝ•s–ž‚Ėd‘Ė‚Æ‚Č‚Á‚Ä‚ĸ‚Ü‚ˇB

    -
    -
  • -
  • -

    “Œ‹žŒÜ—Ö‘ã•\E‰ĒāVƒZƒIƒ“‘IŽč@”íĐ’nŽx‰‡@Žčė‚čƒJƒŒ[’ņ‹Ÿ[11:47] -

    -

    Ž­Ž™“‡Œ§Ž­‰ŽŽsŨZ‚ŁAƒ{ƒNƒVƒ“ƒOEƒEƒGƒ‹ƒ^[‹‰‚Å“Œ‹žƒIƒŠƒ“ƒsƒbƒN‚Ė“ú–{‘ã•\‚ˉĒāVƒZƒIƒ“‘IŽč‚Ēƒvƒƒfƒ…[ƒX‚ĩ‚ŊƒJƒŒ[‚ǁAŽ­‰ŽŽs‚Ėƒzƒeƒ‹‚Å’ņ‹Ÿ‚ŗ‚ę‚Ü‚ĩ‚ŊB

    -
    -
  • -

    07ŒŽ18“ú(“y)

    -
  • -

    Ž­Ž™“‡Žs‚Ė`‚Å’jĢ‚Ėˆâ‘Ė[21:23] -

    -

    Ž­Ž™“‡Žs‚Ė`‚Å‚P‚W“úŒßŒãA’jĢ‚Ēˆâ‘Ė‚ÅŒŠ‚Â‚Š‚č‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    Ž­Ž™“‡EVŒ^ƒRƒƒiŠ´õ”­•\@‚P‚W“ú‚Í‚Ql@—ŨŒv‚P‚U‚Sl[19:16] -

    -

    Ž­Ž™“‡Œ§‚ÆŽ­Ž™“‡Žs‚͐VŒ^ƒRƒƒiƒEƒCƒ‹ƒX‚ĖŠ´õŽŌ‚ǐV‚Ŋ‚É‚QlŠm”F‚ŗ‚ę‚Ŋ‚Æ‚P‚W“úA”­•\‚ĩ‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    ‚Š‚˛‚ĩ‚Ü•é‚į‚ĩ@ƒIƒ“ƒ‰ƒCƒ“ˆÚZ‘Š’k‰ī[17:29] -

    -

    Ž­Ž™“‡‚Ö‚ĖˆÚZ‚đl‚Ļ‚él‚đ‘Ώۂɂĩ‚ŊƒIƒ“ƒ‰ƒCƒ“‚Å‚ĖˆÚZ‘Š’k‰ī‚Ē‚P‚W“úAŠJ‚Š‚ę‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    VŒ^ƒRƒƒi@Ž­Ž™“‡Žs‚ŐV‚Ŋ‚É‚Pl@Œ§“ā—ŨŒv‚P‚U‚Sl‚É[17:10] -

    -

    Ž­Ž™“‡Žs‚͐æ‚Ų‚ĮŒßŒã‚TŽž‚ɐVŒ^ƒRƒƒiƒEƒCƒ‹ƒX‚ĖŠ´õŽŌ‚ǁA‚P‚W“ú‚͐V‚Ŋ‚É‚PlŠm”F‚ŗ‚ę‚Ŋ‚Æ”­•\‚ĩ‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    ‚Z–ė‹…h‘ã‘Ö‘å‰īh@’n‹æ‘ã•\‚P‚UZo‚ģ‚낤[16:02] -

    -

    VŒ^ƒRƒƒiƒEƒCƒ‹ƒX‚ˉe‹ŋ‚Å’†Ž~‚Æ‚Č‚Á‚ŊA‰Ä‚Ė‚Z–ė‹…‚Ė‘ã‘Ö‘å‰īB

    -
    -
  • -
  • -

    VŒ^ƒRƒƒi@Ž­Ž™“‡Œ§“ā‚ŏ‰‚߂ČxŽ@Н‚ĖŠ´õŠm”F[12:14] -

    -

    Œ§Œx‚ÍŒđ’Ę‹@“Ž‘ā‚ÉŠ‘Ž‚ˇ‚é‚Q‚O‘ã‚Ė’jĢŒxŽ@Н‚ǐVŒ^ƒRƒƒiƒEƒCƒ‹ƒX‚ÉŠ´õ‚ĩ‚Ä‚ĸ‚Ŋ‚ą‚Æ‚ĒŠm”F‚ŗ‚ę‚Ŋ‚Æ”­•\‚ĩ‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    ’Ū‚č‚Ė’jĢ‚ĒŠC‚É“]—Ž‚ĩŽ€–S@Ž­Ž™“‡Œ§–Žs[12:12] -

    -

    Ž­Ž™“‡Œ§–Žs‚Å‚P‚V“ú–éA’Ū‚č‚đ‚ĩ‚Ä‚ĸ‚Ŋ’jĢ‚ĒŠC‚É“]—Ž‚ĩ‚ÄŽ€–S‚ĩ‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    Ž­Ž™“‡Œ§Œx@’jĢŒxŽ@Н‚ǐVŒ^ƒRƒƒiŠ´õ[02:16] -

    -

    Ž­Ž™“‡Œ§Œx‚Í‚P‚V“úAŒđ’Ę‹@“Ž‘ā‚Ė‚Q‚O‘ã‚Ė’jĢŒxŽ@Н‚ǐVŒ^ƒRƒƒiƒEƒCƒ‹ƒX‚ÉŠ´õ‚ĩ‚Ŋ‚Æ”­•\‚ĩ‚Ü‚ĩ‚ŊB

    -
    -
  • -

    07ŒŽ17“ú(‹ā)

    -
  • -

    Ž­Ž™“‡Œ§–{“y@‹vX‚ĖÂ‹ķ[19:48] -

    -

    ‚P‚V“ú‚ĖŽ­Ž™“‡Œ§–{“y‚́A‘Oü–k‘¤‚ĖŠŖ‚ĸ‚Ŋ‹ķ‹C‚Ē—Ŧ‚ꍾ‚ŨAÂ‹ķ‚ĒL‚Ē‚č‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    VŒ^ƒRƒƒi@Ž­Ž™“‡Œ§“ā‚ĖŠ´õŠm”F‚Č‚ĩ@‚UŒŽ‚R‚O“úˆČ—ˆ‚P‚V“ú‚Ô‚č[19:47] -

    -

    Ž­Ž™“‡Œ§“ā‚ł͂P‚V“úAV‚Ŋ‚ȐVŒ^ƒRƒƒiƒEƒCƒ‹ƒX‚Ö‚ĖŠ´õŽŌ‚ÍŠm”F‚ŗ‚ę‚Ü‚š‚ņ‚Å‚ĩ‚ŊB

    -
    -
  • -
  • -

    g“Œ‹žœŠOh‚Å‚Q‚Q“ú‚Š‚įu‚f‚@‚s‚@ƒgƒ‰ƒxƒ‹v@Šú‘Ō‚Æ•sˆĀ‚ːē[19:45] -

    -

    VŒ^ƒRƒƒiƒEƒCƒ‹ƒX‚őŌ‚‚đŽķ‚¯‚Ä‚ĸ‚éŠĪŒõ‹Æ‚đŽx‰‡‚ˇ‚éu‚f‚‚s‚ƒgƒ‰ƒxƒ‹vƒLƒƒƒ“ƒy[ƒ“‚ɂ‚ĸ‚āA­•{‚Í—ˆT‚Q‚Q“ú‚Š‚į“Œ‹ž‚đœŠO‚ˇ‚éŒ`‚ŃXƒ^[ƒg‚ˇ‚é•ûj‚đŽĻ‚ĩ‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    ‚P‚X“‘S”ŧÄ@•ú‰Î‚Ėß@Á–h’cˆõ‚Ė’j‚É’Ļ–đ‚P‚Q”N‚ĖŽĀŒY”ģŒˆ[19:44] -

    -

    Ž­Ž™“‡Œ§‰‚”ü‘哇‚Ė—´‹Ŋ’Ŧ‚Å‚¨‚ƂƂĩA‹ķ‚Ģ‰Æ‚É‰Î‚đ‚‚¯AZ‘î‚Č‚Į‚P‚X“‚đ‘S”ŧÄ‚ŗ‚š‚é‚Č‚Į‚ĩ‚ŊŒģZŒš‘ĸ•¨“™•ú‰Î‚Č‚Į‚Ėß‚É–â‚í‚ę‚Ä‚ĸ‚éÁ–h’cˆõ‚ĖŲ”ģˆõŲ”ģ‚ŁA’Ļ–đ‚P‚Q”N‚ĖŽĀŒY”ģŒˆ‚ĒŒž‚ĸ“n‚ŗ‚ę‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    ŒˆŸƒg[ƒiƒƒ“ƒg–ÚŽw‚ĩ‚āI@Ž­Ž™“‡Œ§‰Ä‹G‚Z–ė‹…‘å‰ī[19:43] -

    -

    VŒ^ƒRƒƒiƒEƒCƒ‹ƒX‚ˉe‹ŋ‚Å’†Ž~‚Æ‚Č‚Á‚ŊA‰Ä‚Ė‚Z–ė‹…‚Ė‘ã‘Ö‘å‰ī‚́A’n‹æ—\‘I‚ĖI”Õ‚đŒ}‚ςĂĸ‚Ü‚ˇB

    -
    -
  • -
  • -

    ”­ļ‚RŽžŠÔŒã‚É”đ“īî•ņ@ŽF–€ė“āŽs‚Ė‰Íė”×”‚ÅŒŠ‚Ļ‚Ŋ‰Û‘č[19:42] -

    -

    ŽF–€ė“āŽs‚ł́AĄŒŽ‚R“ú‚ɐė“āė‚ĖŽx—Ŧ‚Ŕ×”‚Ē”­ļ‚ĩZ…”íŠQ‚āo‚Ü‚ĩ‚Ŋ‚ǁA”đ“īî•ņ‚Ēo‚Ŋ‚Ė‚Í”Ã—””­ļ‚Ė‚RŽžŠÔŒã‚Å‚ĩ‚ŊB

    -
    -
  • -
  • -

    •Ûˆį‰€‚ŁuƒEƒiƒM‹‹Hv@Ž­Ž™“‡Œ§‘åč’Ŧ[19:42] -

    -

    Ž­Ž™“‡Œ§‘åč’Ŧ‚Ė‘åŠÛ•Ûˆį‰€‚Å‚P‚V“úA‹‹H‚ɏo‚ŗ‚ę‚Ŋ‚Ė‚ÍƒEƒiƒM‚Ė‚Š‚ÎÄ‚ĢB

    -
    -
  • -
  • -

    ‚͂邺‚Æ“Á”hˆõ‚ĒŽB‚Á‚ŊIu”’‚ĸƒXƒYƒv‚Ɓu‹āF‚ĖƒhƒWƒ‡ƒEv[19:40] -

    -

    ‚l‚a‚b‚͂邺‚Æ“Á”hˆõ‚Š‚įA•Ī‚í‚Á‚ŊF‚ːļ‚Ģ•¨‚ˉf‘œ‚Ē“Í‚Ģ‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    ‰„Šú‚Ė‚g‚Q‚`ƒƒPƒbƒg@ĄŒŽ‚Q‚O“úŒß‘O‘Å‚ŋã‚°‚Ö[19:39] -

    -

    “VŒķ•s—Į‚őłŋã‚°‚lj„Šú‚ŗ‚ę‚Ä‚ĸ‚Ŋ‚g‚Q‚`ƒƒPƒbƒg‚S‚Q†‹@‚ɂ‚ĸ‚āAŽO•HdH‚́AĄŒŽ‚Q‚O“ú‚ĖŒß‘O‚UŽž‚T‚W•Ē‚ÉŽ­Ž™“‡Œ§‚ĖŽíŽq“‡‰F’ˆƒZƒ“ƒ^[‚Š‚į‘Å‚ŋã‚°‚邯”­•\‚ĩ‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    Ž­Ž™“‡Œ§“ā@VŒ^ƒRƒƒiV‹KŠ´õŽŌ‚̓[ƒ[17:51] -

    -

    Ž­Ž™“‡Œ§‚ÆŽ­Ž™“‡Žs‚Í‚P‚V“úAV‚ĩ‚­Šm”F‚ŗ‚ę‚ŊVŒ^ƒRƒƒiƒEƒCƒ‹ƒX‚ĖŠ´õŽŌ‚Í‚ĸ‚ȂЂÁ‚Ŋ‚Æ”­•\‚ĩ‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    ‚i‚qŽ­Ž™“‡–{ü@ė“ā|ŒG”VéŠÔ‚ʼn^“]ÄŠJ[16:29] -

    -

    ‘å‰J‚ˉe‹ŋ‚ʼn^“]‚đŒŠ‡‚킚‚Ä‚ĸ‚Ŋ‚i‚qŽ­Ž™“‡–{ü‚ːė“ā[ŒG”Vé‚ĖŠÔ‚ÍAĄŒŽ‚Q‚O“ú‚Š‚įˆę•”‚ʼn^“]‚đÄŠJ‚ĩ‚Ü‚ˇB

    -
    -
  • -
  • -

    ‰Ž‹v“‡’Ŧo’Ŗ—ˇ”ī–â‘č@‘O‹c’ˇ‚đŧ‹\‚Ė‹^‚ĸ‚ÅŒYŽ–”­‚Ö[16:06] -

    -

    Ž­Ž™“‡Œ§‰Ž‹v“‡’Ŧ‚Ė‘O‚Ė’Ŧ‹c‰ī‹c’ˇ‚Ė’jĢ‚ǁAo’Ŗ—ˇ”ī‚đ•sŗ‚ÉŽķ‚¯Žæ‚Á‚Ä‚ĸ‚Ŋ‚Æ‚ĩ‚āAZ–¯‚į‚Ēŧ‹\‚Ė‹^‚ĸ‚ŋ߂­ŒYŽ–”­‚ˇ‚él‚Ļ‚đŽĻ‚ĩ‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    ŽF–€ė“āŽs‚Ė•ļ‰ģƒz[ƒ‹Õ’n—˜—p@‹ã“d’ņˆÄ‚ĖŽ{ŨŒšŨˆÄ‚đĖ—p[16:05] -

    -

    —ˆ”Nt‚É•ÂŠŲ‚ˇ‚éŽ­Ž™“‡Œ§ŽF–€ė“āŽs‚ːė“ā•ļ‰ģƒz[ƒ‹‚ːՒn‚ɂ‚ĸ‚āAŽs‚Í‹ãB“d—Í‚Ē’ņˆÄ‚ĩ‚ŊV‚Ŋ‚ČŽ{Ũ‚ĖŒšŨˆÄ‚đĖ—p‚ĩAĄŒã‹Ļ‹c‚đi‚ß‚é•ûj‚Å‚ˇB

    -
    -
  • -
  • -

    u‚r‚c‚f‚“v‚ĖˆęŠÂ‚ŏŦŒ^“d‹CŽŠ“ŽŽÔ‚đ“ą“ü@Ž­Ž™“‡‘ŠŒŨM—p‹āŒÉ[16:00] -

    -

    Ž­Ž™“‡‘ŠŒŨM—p‹āŒÉ‚Ē‚r‚c‚f‚“uŽ‘ą‰Â”\‚ȎЉī‚đė‚銈“ށv‚ĖˆęŠÂ‚Æ‚ĩ‚āAˆęlæ‚č‚ĖŦŒ^“d‹CŽŠ“ŽŽÔ‚đ“ą“ü‚ĩ‚P‚V“úAo”­ŽŽ‚Ēs‚í‚ę‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    ŒF–{‚Ők“x‚R‚Ė’nk@Ž­Ž™“‡Œ§’ˇ“‡’Ŧ‚Ők“x‚P[15:07] -

    -

    ‚P‚V“úŒßŒã‚QŽž‚T‚S•Ē‚˛‚ëŒF–{Œ§ŒF–{’n•û‚đkŒš’n‚Æ‚ˇ‚é’nk‚Ē‚ ‚čAŒF–{Œ§‚ōőåk“x‚R‚đŠĪ‘Ē‚ĩ‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    ’čŠúŒŸ¸’†‚ĖŽ­Ž™“‡Eė“āŒ´”­‚P†‹@‚ŋȂǂÁ‚Ŋ§Œä–_Šm”F[11:56] -

    -

    ’čŠúŒŸ¸’†‚ĖŽ­Ž™“‡Œ§‚ːė“āŒ´”­‚P†‹@‚ŁA§Œä–_‚Ė‚¤‚ŋ‚Ė‚P–{‚NjȂǂÁ‚Ä‚ĸ‚é‚Ė‚ĒŠm”F‚ŗ‚ę‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    Žu•zŽuŽs‚ĖŒ§“š‚T‚P‚R†@’ʍsŽ~‚߉đœ[10:18] -

    -

    Œ§“š‚T‚P‚R†‹{ƒPŒ´‘åčü‚ĖŽ­Ž™“‡Œ§Žu•zŽuŽs—L–ž’ŦŽRd•t‹ß‚ł́AĄŒŽ‚U“ú‚Š‚į“yģ•ö‚ę‚Ė‚Ŋ‚ß’ĘsŽ~‚ß‚Æ‚Č‚Á‚Ä‚ĸ‚Ü‚ĩ‚Ŋ‚ǁA•œ‹Œė‹Æ‚ĒI‚í‚čA‚P‚V“úŒß‘O‚XŽž‚É‰đœ‚ŗ‚ę‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    ‰‚”ü’n•û‚Å‚P‚V“ú—Ž—‹‚â“Ë•—‚É’ˆĶ[09:08] -

    -

    ‰‚”ü’n•û‚ł͂P‚V“úA—Ž—‹‚â—ŗŠĒ‚Č‚Į‚ĖŒƒ‚ĩ‚ĸ“Ë•—A‹}‚Č‹­‚ĸ‰J‚É’ˆĶ‚ĩ‚Ä‚­‚ž‚ŗ‚ĸB

    -
    -
  • -

    07ŒŽ16“ú(–Ø)

    -
  • -

    Ž­Ž™“‡Œ§“낺‚‚܎s‚Å”­ŒŠ‚Ėˆâ‘ˁ@s•û•s–ž‚ːV•ˇ”z’Bˆõ‚Ė’jĢ‚ÆŠm”F[22:15] -

    -

    Ž­Ž™“‡Œ§“낺‚‚܎s‚Ė–œ”VŖė‚Ė‰Íė•~‚Å‚P‚S“ú‚ÉŒŠ‚Â‚Š‚Á‚Ŋ’jĢ‚Ėˆâ‘Ė‚ÍAĄŒŽ‚U“ú‚Š‚įs•û‚ǕǂЂį‚Č‚­‚Č‚Á‚Ä‚ĸ‚Ŋ“낺‚‚܎s‚ːV•ˇ”z’Bˆõ‚Ė’jĢ‚ÆŠm”F‚ŗ‚ę‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    Ž­Ž™“‡Žs‚ÅŒxŽ@Н‚Č‚Į–ŧæ‚é•sR“d˜b‘ŠŽŸ‚ށ@’ˆĶ‚đ[19:48] -

    -

    Ž­Ž™“‡Žs‚ł͂P‚S“úAŒxŽ@Н‚Č‚Į‚đ–ŧæ‚é•sR‚Č“d˜b‚Ē‘ŠŽŸ‚Ŧ‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    Q‚Ŋ‚Ģ‚č‚Ė•ęe‚đ‰Ŗ‚Á‚ÄŽ€‚Č‚š‚Ŋ‹^‚ĸ@‚V‚OÎ’ˇ’j‚đ‘ߕ߁@Ž­Ž™“‡Œ§’m–ŧ’Ŧ[19:23] -

    -

    Ž­Ž™“‡Œ§‰Ģ‰i—Į•”“‡‚Ė’m–ŧ’Ŧ‚ŁAQ‚Ŋ‚Ģ‚č‚Ė•ęe‚đ‰Ŗ‚Á‚ÄŽ€–S‚ŗ‚š‚Ŋ‚Æ‚ĩ‚āA“¯‹‚ˇ‚é‚V‚OÎ‚Ė’ˇ’j‚ĒŠQ’vŽ€‚Ė‹^‚ĸ‚őߕ߂ŗ‚ę‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    ’ˇ‰J‚Å“úÆ•s‘́@•Ŋ”N‚Ė‚PŠ„–ĸ–ž‚ā@Ž­Ž™“‡Œ§“ā‚ĖÁ”ī‚ɉe‹ŋ[19:22] -

    -

    ”~‰J‚Ė’ˇ‰J‚ˉe‹ŋ‚ŁAŽ­Ž™“‡Œ§‚Ė“ú’uŽs‚âŽF–€ė“āŽs‚ł́A‚ą‚Ė‚P‚O“úŠÔ‚Ė“úÆŽžŠÔ‚Ē•Ŋ”N‚Ė‚PŠ„‚É‚ā–ž‚Ŋ‚Č‚ĸ‚Č‚ĮA“úÆ•s‘̂Ǒą‚ĸ‚Ä‚ĸ‚Ü‚ˇB

    -
    -
  • -
  • -

    ‹L˜^“I‘å‰J‚ĖŽ­Ž™“‡Œ§“ā@Še’n‚Å•œ‹Œė‹Æ‘ą‚­[19:22] -

    -

    Ž­Ž™“‡Œ§‚Ė‘å‹÷’n•û‚ł́AĄŒŽ‚U“ú‚ÉŠĪ‘ĒŽjãÅ‘å‚ĖŽžŠÔ‰J—Ę‚P‚O‚XE‚Tƒ~ƒŠ‚đŠĪ‘Ē‚ˇ‚é‚Č‚ĮA‹L˜^“I‚Č‘å‰J‚Æ‚Č‚č‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    VŒ^ƒRƒƒiV‚Ŋ‚É‚SlŠ´õŠm”F@Ž­Ž™“‡Œ§“ā‚ĖŠ´õŽŌ‚Í‚P‚U‚Ql‚É[19:21] -

    -

    Ž­Ž™“‡Œ§“ā‚ł́A‚Sl‚ːVŒ^ƒRƒƒiƒEƒCƒ‹ƒX‚Ö‚ĖŠ´õ‚ǐV‚Ŋ‚ÉŠm”F‚ŗ‚ę‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    VŒ^ƒRƒƒih”‘—×{Ž{Ũ‚Ɂ@Ž­Ž™“‡Œ§‚ǐV‚Ŋ‚Ƀzƒeƒ‹‚đŽØ‚čã‚°[19:20] -

    -

    VŒ^ƒRƒƒi‚ĖŠ´õŠm”F‚Ē‘‰Á‚ˇ‚é’†AŽ­Ž™“‡Œ§‚ÍŒyĮ‚â–ŗĮķ‚ĖŠ´õŽŌ‚Č‚Į‚ɑ؍Ũ‚ĩ‚Ä‚ā‚Ⴄ‚Ŋ‚߂ɁAV‚Ŋ‚ÉŽ­Ž™“‡Žs“ā‚Ėƒzƒeƒ‹‚P“‚đŽØ‚čã‚°‚Ŋ‚Æ”­•\‚ĩ‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    ŽŠ–¯“}Ž­Ž™“‡Œ§‹c’c@’mŽ–‘I‘Š‡‚ˉī‹c@uŒ‹˜_Ž‚ŋ‰z‚ĩv[19:19] -

    -

    ‚P‚Q“ú‚ɓЕ[‚Ēs‚í‚ę‚ŊŽ­Ž™“‡Œ§’mŽ–‘I‹“‚ŁA„‘E‚ĩ‚ŊŒģEŒķ•â‚Ē”s‚ę‚Ŋ‚ą‚Æ‚đŽķ‚¯‚āAŽŠ–¯“}Œ§‹c’c‚Í‚P‚U“úA‘Š‡‚ˇ‚é‰ī‹c‚đŠJ‚̂܂ĩ‚Ŋ‚ǁAŒ‹˜_‚ÍŽ‚ŋ‰z‚ŗ‚ę‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    Ž­Ž™“‡Œ§‹c‰ī‹cˆõ•⌇‘I‹“@“–‘I‚˒߉’^˛•F‚ŗ‚ņ‚Ē‰“o’Ą[16:21] -

    -

    ĄŒŽ‚P‚Q“ú‚É“ŠŠJ•[‚Ēs‚í‚ę‚ŊŽ­Ž™“‡Œ§‹c‰ī‹cˆõŽF–€ė“āŽs‹æ‚Ė•âŒ‡‘I‹“‚Å“–‘I‚ĩ‚Ŋ’߉’^˛•F‚ŗ‚ņ‚Ē‚P‚U“úA‰“o’Ą‚ĩ‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    uŽ­Ž™“‡Žs‚ːíĐ‚Æ•œ‹ģŽĘ^“WvŽn‚Ü‚é@’ˇč‚ĖŒ´”š”íŠQ‚Ėƒpƒlƒ‹‚ā[16:21] -

    -

    Ž­Ž™“‡Žs–đŠ‚ÅAŽ­Ž™“‡‚Æ’ˇč‚Ėí‘ˆ”íŠQ‚Æ•œ‹ģ‚Ė•ā‚Ũ‚đŽû‚ß‚ŊŽĘ^“W‚Ē‚P‚U“ú‚Š‚įŽn‚Ü‚č‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    ˆĸ‹vĒŽs‚Ė–Ŗ—Í‚Ē‹l‚Ü‚Á‚Ŋu‚¨h@‚Ũ‚Į‚ą‚ĸvƒI[ƒvƒ“[16:20] -

    -

    Ž­Ž™“‡Œ§ˆĸ‹vĒŽs‚Ė–Ŗ—Í‚Ē‹l‚Ü‚Á‚Ŋh”‘Ž{Ũu‚¨h@‚Ũ‚Į‚ą‚ĸv‚ĒƒI[ƒvƒ“‚ĩ‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    ‰Ž‹v“‡’ŦEr–؍kŽĄ’Ŧ’ˇ‚đŧ‹\‚Č‚Į‚Ė‹^‚ĸ‚ŏ‘—Ū‘—ŒŸ@—ˇ”ī’…•ž–â‘č[16:00] -

    -

    ‰Ž‹v“‡’Ŧ‚Ėr–؍kŽĄ’Ŧ’ˇ‚Ēo’Ŗ—ˇ”ī‚Ėˆę•”‚đ’…•ž‚ĩ‚Ä‚ĸ‚Ŋ–â‘č‚đ„‚čAŽ­Ž™“‡Œ§Œx‚Í‚P‚U“úAr–؍kŽĄ’Ŧ’ˇ‚đŧ‹\‚Č‚Į‚Ė‹^‚ĸ‚ŏ‘—Ū‘—ŒŸ‚ĩ‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    Ž­Ž™“‡Œ§“ā‚ːVŒ^ƒRƒƒiŠ´õŽŌŠg‘å‚đŽķ‚¯@åŠŪ‰€‚Ē‹x‹ÆŠúŠÔ‚đ‰„’ˇ[11:56] -

    -

    VŒ^ƒRƒƒiƒEƒCƒ‹ƒX‚ˉe‹ŋ‚ōĄ”N‚SŒŽ‚Š‚į‹x‹Æ‚ĩ‚Ä‚ĸ‚éŽ­Ž™“‡Žs‚ˁuåŠŪ‰€v‚́A‚P‚V“ú‚Š‚į‰c‹Æ‚đÄŠJ‚ˇ‚é—\’č‚Å‚ĩ‚Ŋ‚ǁAĄŒŽ‚É“ü‚čAŒ§“ā‚ÅŠ´õŽŌ‚Ē‘‚ςĂĸ‚邹‚Æ‚đŽķ‚¯A‹x‹ÆŠúŠÔ‚đ‰„’ˇ‚ˇ‚邯”­•\‚ĩ‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    Ž­‰ŽŽs‚Ė‘“š‚Q‚Q‚O†ŒÃ]ƒoƒCƒpƒX@’ʍsÄŠJ[09:16] -

    -

    ‘“š‚Q‚Q‚O†ŒÃ]ƒoƒCƒpƒX‚ĖŽ­‰ŽŽs‚ĖĒ–ØŒ´Œđˇ“_‚Ɛ‚…Žs‚˂܂ŗ‚Š‚čŒđˇ“_‚ĖŠÔ‚Å‚ÍAĄŒŽ‚U“ú‚Š‚į“yģ‚Ė—Ŧޏ‚Ė•œ‹Œė‹Æ‚Ė‚Ŋ‚ß’ĘsŽ~‚ß‚Æ‚Č‚Á‚Ä‚ĸ‚Ü‚ĩ‚Ŋ‚ǁA‚P‚U“úŒß‘O‚UŽž‚ɁA‹K§‚Í‰đœ‚ŗ‚ę‚Ü‚ĩ‚ŊB

    -
    -
  • -
  • -

    ‰‚”ü’n•û‚Å‚P‚V“ú‚ɂЂ¯‚Ä—Ž—‹‚â“Ë•—‚É’ˆĶ[08:30] -

    -

    ‰‚”ü’n•û‚Å‚P‚V“ú‚ɂЂ¯‚Ä—Ž—‹‚â—ŗŠĒ‚Č‚Į‚ĖŒƒ‚ĩ“Ë•—A‹}‚Č‹­‚ĸ‰J‚É’ˆĶ‚ĩ‚Ä‚­‚ž‚ŗ‚ĸB

    -
    -
  • -
  • -

    z–K”VŖ“‡‚Ŕ𔭓I•Ŧ‰Î[08:17] -

    -

    \“‡‘ē‚ːz–K”VŖ“‡‚Å‚P‚U“ú’ЁA”š”­“I•Ŧ‰Î‚Ē”­ļ‚ĩ‚Ü‚ĩ‚ŊB

    -
    -
  • - - -
    - - - -
    -
    -
    -
    -
    - -
    - -
    -
    - -
    -
    - -
    - -
    -
    -
    -
    -
    -
    - - -
    -
    -
    - -
    - -
    -
    -
    -
    - - - - -
    Copyright(c) Minaminihon Broadcasting Co.,Ltd. All rights reserved.
    - ŒfÚ‚ŗ‚ę‚Ŋ‘S‚ĂˋLŽ–E‰æ‘œ“™‚Ė–ŗ’f“]ÚA“ņŽŸ—˜—p‚đ‚¨’f‚č‚ĸ‚Ŋ‚ĩ‚Ü‚ˇB
    - - - - - diff --git a/tests/mock_server/templates/title_og_with_html.com.html b/tests/mock_server/templates/title_og_with_html.com.html deleted file mode 100644 index 6c5688c7ec..0000000000 --- a/tests/mock_server/templates/title_og_with_html.com.html +++ /dev/null @@ -1,698 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    - Skip to content -

    - 24 ways - to impress your friends - -

    -
    -
    - - - -
    - - -
    -
    -
    -

    It All Starts with a Humble <textarea>

    - -
    - -
    -
      -
    • - -
    • - - -
    • Published in - UX -
    • - - -
    • - No comments -
    • -
    -
    - -
    - -
    -

    Those that know me well know that I make - a lot - of - side projects. I most definitely make too many, but there’s one really useful thing about making lots of side projects: it allows me to experiment in a low-risk setting. -

    -

    Side projects also allow me to accidentally create a context where I can demonstrate a really affective, long-running methodology for building on the web: - progressive enhancement. That context is a little Progressive Web App that I’m tinkering with called - Jotter. It’s incredibly simple, but under the hood, there’s a really solid experience built on top of a - minimum viable experience - which after reading this article, you’ll hopefully apply this methodology to your own work.

    -
    - The Jotter Progressive Web App presented in the Google Chrome browser. - -
    -

    What is a minimum viable experience?

    -

    The key to progressive enhancement is distilling the user experience to its lowest possible technical solution and then building on it to improve the user experience. In the context of - Jotter, that is a humble - <textarea> - element. That humble - <textarea> - is our - minimum viable experience. -

    -

    Let me show you how it’s built up, progressively real quick. If you disable CSS and JavaScript, you get this:

    -
    - The Jotter Progressive Web App with CSS and JavaScript disabled shows a HTML only experience. - -
    -

    This result is great because I know that regardless of what happens, the user can do what they needed to do when the loaded Jotter in their browser: take some notes. That’s our - minimum viable experience, completed with a few lines of code that work in - every single browser—even very old browsers. Don’t you just love good ol’ HTML? -

    -

    Now it’s time to enhance that minimum viable experience, - progressively. It’s a good idea to do that in smaller steps rather than just provide a 0% experience or a 100% experience, which is the approach that’s often favoured by JavaScript framework enthusiasts. I think that process is counter-intuitive to the web, though, so building up from a minimum viable experience is the optimal way to go, in my opinion. -

    -

    Understanding how a - minimum viable experience - works can be a bit tough, admittedly, so I like to use a the following diagram to explain the process:

    -
    - Minimum viable experience diagram which is described in the next paragraph. - -
    -

    Let me break down this diagram for both folks who can and can’t see it. On the top row, there’s four stages of a broken-up car, starting with just a wheel, all the way up to a fully functioning car. The car enhances only in a way that it is still - mostly useless - until it gets to its final form when the person is finally happy. -

    -

    On the second row, instead of building a car, we start with a skateboard which immediately does the job of getting the person from point A to point B. This enhances to a Micro Scooter and then to a Push Bike. Its final form is a fancy looking Motor Scooter. I choose that instead of a car deliberately because generally, when you progressively enhance a project, it turns out to be - way simpler and lighter - than a project that was built without progressive enhancement in mind.

    -

    Now that we know what a minimum viable experience is and how it works, let’s apply this methodology to Jotter! -

    -

    Add some CSS

    -

    The first enhancement is CSS. Jotter has a very simple design, which is mostly a full height - <textarea> - with a little sidebar. A flexbox-based, auto-stacking layout, inspired by a layout called - The Sidebar - is used and we’re good to go. -

    -

    Based on the diagram from earlier, we can comfortably say we’re in - Skateboard - territory now.

    -

    Add some JavaScript

    -

    We’ve got styles now, so let’s - enhance - the experience again. A user can currently load up the site and take notes. If the CSS loads, it’ll be a more pleasant experience, but if they refresh their browser, they’re going to lose all of their work.

    -

    We can fix that by adding some - local storage - into the mix. -

    -

    The functionality flow is pretty straightforward. As a user inputs content, the JavaScript listens to an - input - event and pushes the content of the - <textarea> - into - localStorage. If we then set that - localStorage - data to populate the - <textarea> - on load, that user’s experience is suddenly - enhanced - because they can’t lose their work by accidentally refreshing. -

    -

    The JavaScript is incredibly light, too: -

    -
    const textArea = document.querySelector('textarea');
    -const storageKey = 'text';
    -
    -const init = () => {
    -
    -  textArea.value = localStorage.getItem(storageKey);
    -
    -  textArea.addEventListener('input', () => {
    -    localStorage.setItem(storageKey, textArea.value);
    -  });
    -}
    -
    -init();
    -

    In around 13 lines of code (which you can see a - working demo here), we’ve been able to enhance the user’s experience - considerably, and if we think back to our diagram from earlier, we are very much in - Micro Scooter - territory now. -

    -

    Making it a PWA

    -

    We’re in really good shape now, so let’s turn Jotter into a - Motor Scooter - and make this thing work offline as an installable Progressive Web App (PWA). -

    -

    Making a PWA is really achievable and Google have even produced a - handy checklist - to help you get going. You can also get guidance from a - Lighthouse audit. -

    -

    For this little app, all we need is a - manifest - and a - Service Worker - to cache assets and serve them offline for us if needed.

    -

    The Service Worker is actually pretty slim, so here it is in its entirety: -

    -
    const VERSION = '0.1.3';
    -const CACHE_KEYS = {
    -  MAIN: `main-${VERSION}`
    -};
    -
    -// URLS that we want to be cached when the worker is installed
    -const PRE_CACHE_URLS = ['/', '/css/global.css', '/js/app.js', '/js/components/content.js'];
    -
    -/**
    - * Takes an array of strings and puts them in a named cache store
    - *
    - * @param {String} cacheName
    - * @param {Array} items=[]
    - */
    -const addItemsToCache = function(cacheName, items = []) {
    -  caches.open(cacheName).then(cache => cache.addAll(items));
    -};
    -
    -self.addEventListener('install', evt => {
    -  self.skipWaiting();
    -
    -  addItemsToCache(CACHE_KEYS.MAIN, PRE_CACHE_URLS);
    -});
    -
    -self.addEventListener('activate', evt => {
    -  // Look for any old caches that don't match our set and clear them out
    -  evt.waitUntil(
    -    caches
    -      .keys()
    -      .then(cacheNames => {
    -        return cacheNames.filter(item => !Object.values(CACHE_KEYS).includes(item));
    -      })
    -      .then(itemsToDelete => {
    -        return Promise.all(
    -          itemsToDelete.map(item => {
    -            return caches.delete(item);
    -          })
    -        );
    -      })
    -      .then(() => self.clients.claim())
    -  );
    -});
    -
    -self.addEventListener('fetch', evt => {
    -  evt.respondWith(
    -    caches.match(evt.request).then(cachedResponse => {
    -      // Item found in cache so return
    -      if (cachedResponse) {
    -        return cachedResponse;
    -      }
    -
    -      // Nothing found so load up the request from the network
    -      return caches.open(CACHE_KEYS.MAIN).then(cache => {
    -        return fetch(evt.request)
    -          .then(response => {
    -            // Put the new response in cache and return it
    -            return cache.put(evt.request, response.clone()).then(() => {
    -              return response;
    -            });
    -          })
    -          .catch(ex => {
    -            return;
    -          });
    -      });
    -    })
    -  );
    -});
    -

    What the Service Worker does here is pre-cache our core assets that we define in PRE_CACHE_URLS. Then, for each fetch event which is called per request, it’ll try to fulfil the request from cache first. If it can’t do that, it’ll load the remote request for us. With this setup, we achieve two things:

    -
      -
    1. We get offline support because we stick our critical assets in cache immediately so they will be accessible offline
    2. -
    3. Once those critical assets and any other requested assets are cached, the app will run faster by default
    4. -
    -

    Importantly now, because we have a manifest, some shortcut icons and a Service Worker that gives us offline support, we have a fully installable PWA!

    -

    Wrapping up

    -

    I hope with this simplified example you can see how approaching web design and development with a progressive enhancement approach, everyone gets an acceptable experience instead of those who are lucky enough to get every aspect of the page at the right time.

    -

    Jotter is very much live and in the process of being enhanced further, which you can see on its little in-app roadmap, so go ahead and play around with it.

    -

    Before you know it, it’ll be a car itself, but remember: it’ll always start as a humble little <textarea>.

    -
    -
    - -
    -
    -

    About the author

    -
    -
    -
    - -

    Andy Bell is an independent designer and front-end developer who’s trying to make everyone’s experience on the web better with a focus on progressive enhancement and accessibility.

    -

    More articles by Andy

    - -
    -
    -
    - - - - - - - - - - - - - -
    -
    -

    Comments

    -
    - -
    - - - - -
    -
    - diff --git a/tests/mock_server/templates/title_with_html.com.html b/tests/mock_server/templates/title_with_html.com.html deleted file mode 100644 index e84dcaa0a1..0000000000 --- a/tests/mock_server/templates/title_with_html.com.html +++ /dev/null @@ -1,699 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - It All Starts with a Humble <textarea> ◆ 24 ways - - -
    - Skip to content -

    - 24 ways - to impress your friends - -

    -
    -
    - - - -
    - - -
    -
    -
    -

    It All Starts with a Humble <textarea>

    - -
    - -
    -
      -
    • - -
    • - - -
    • Published in - UX -
    • - - -
    • - No comments -
    • -
    -
    - -
    - -
    -

    Those that know me well know that I make - a lot - of - side projects. I most definitely make too many, but there’s one really useful thing about making lots of side projects: it allows me to experiment in a low-risk setting. -

    -

    Side projects also allow me to accidentally create a context where I can demonstrate a really affective, long-running methodology for building on the web: - progressive enhancement. That context is a little Progressive Web App that I’m tinkering with called - Jotter. It’s incredibly simple, but under the hood, there’s a really solid experience built on top of a - minimum viable experience - which after reading this article, you’ll hopefully apply this methodology to your own work.

    -
    - The Jotter Progressive Web App presented in the Google Chrome browser. - -
    -

    What is a minimum viable experience?

    -

    The key to progressive enhancement is distilling the user experience to its lowest possible technical solution and then building on it to improve the user experience. In the context of - Jotter, that is a humble - <textarea> - element. That humble - <textarea> - is our - minimum viable experience. -

    -

    Let me show you how it’s built up, progressively real quick. If you disable CSS and JavaScript, you get this:

    -
    - The Jotter Progressive Web App with CSS and JavaScript disabled shows a HTML only experience. - -
    -

    This result is great because I know that regardless of what happens, the user can do what they needed to do when the loaded Jotter in their browser: take some notes. That’s our - minimum viable experience, completed with a few lines of code that work in - every single browser—even very old browsers. Don’t you just love good ol’ HTML? -

    -

    Now it’s time to enhance that minimum viable experience, - progressively. It’s a good idea to do that in smaller steps rather than just provide a 0% experience or a 100% experience, which is the approach that’s often favoured by JavaScript framework enthusiasts. I think that process is counter-intuitive to the web, though, so building up from a minimum viable experience is the optimal way to go, in my opinion. -

    -

    Understanding how a - minimum viable experience - works can be a bit tough, admittedly, so I like to use a the following diagram to explain the process:

    -
    - Minimum viable experience diagram which is described in the next paragraph. - -
    -

    Let me break down this diagram for both folks who can and can’t see it. On the top row, there’s four stages of a broken-up car, starting with just a wheel, all the way up to a fully functioning car. The car enhances only in a way that it is still - mostly useless - until it gets to its final form when the person is finally happy. -

    -

    On the second row, instead of building a car, we start with a skateboard which immediately does the job of getting the person from point A to point B. This enhances to a Micro Scooter and then to a Push Bike. Its final form is a fancy looking Motor Scooter. I choose that instead of a car deliberately because generally, when you progressively enhance a project, it turns out to be - way simpler and lighter - than a project that was built without progressive enhancement in mind.

    -

    Now that we know what a minimum viable experience is and how it works, let’s apply this methodology to Jotter! -

    -

    Add some CSS

    -

    The first enhancement is CSS. Jotter has a very simple design, which is mostly a full height - <textarea> - with a little sidebar. A flexbox-based, auto-stacking layout, inspired by a layout called - The Sidebar - is used and we’re good to go. -

    -

    Based on the diagram from earlier, we can comfortably say we’re in - Skateboard - territory now.

    -

    Add some JavaScript

    -

    We’ve got styles now, so let’s - enhance - the experience again. A user can currently load up the site and take notes. If the CSS loads, it’ll be a more pleasant experience, but if they refresh their browser, they’re going to lose all of their work.

    -

    We can fix that by adding some - local storage - into the mix. -

    -

    The functionality flow is pretty straightforward. As a user inputs content, the JavaScript listens to an - input - event and pushes the content of the - <textarea> - into - localStorage. If we then set that - localStorage - data to populate the - <textarea> - on load, that user’s experience is suddenly - enhanced - because they can’t lose their work by accidentally refreshing. -

    -

    The JavaScript is incredibly light, too: -

    -
    const textArea = document.querySelector('textarea');
    -const storageKey = 'text';
    -
    -const init = () => {
    -
    -  textArea.value = localStorage.getItem(storageKey);
    -
    -  textArea.addEventListener('input', () => {
    -    localStorage.setItem(storageKey, textArea.value);
    -  });
    -}
    -
    -init();
    -

    In around 13 lines of code (which you can see a - working demo here), we’ve been able to enhance the user’s experience - considerably, and if we think back to our diagram from earlier, we are very much in - Micro Scooter - territory now. -

    -

    Making it a PWA

    -

    We’re in really good shape now, so let’s turn Jotter into a - Motor Scooter - and make this thing work offline as an installable Progressive Web App (PWA). -

    -

    Making a PWA is really achievable and Google have even produced a - handy checklist - to help you get going. You can also get guidance from a - Lighthouse audit. -

    -

    For this little app, all we need is a - manifest - and a - Service Worker - to cache assets and serve them offline for us if needed.

    -

    The Service Worker is actually pretty slim, so here it is in its entirety: -

    -
    const VERSION = '0.1.3';
    -const CACHE_KEYS = {
    -  MAIN: `main-${VERSION}`
    -};
    -
    -// URLS that we want to be cached when the worker is installed
    -const PRE_CACHE_URLS = ['/', '/css/global.css', '/js/app.js', '/js/components/content.js'];
    -
    -/**
    - * Takes an array of strings and puts them in a named cache store
    - *
    - * @param {String} cacheName
    - * @param {Array} items=[]
    - */
    -const addItemsToCache = function(cacheName, items = []) {
    -  caches.open(cacheName).then(cache => cache.addAll(items));
    -};
    -
    -self.addEventListener('install', evt => {
    -  self.skipWaiting();
    -
    -  addItemsToCache(CACHE_KEYS.MAIN, PRE_CACHE_URLS);
    -});
    -
    -self.addEventListener('activate', evt => {
    -  // Look for any old caches that don't match our set and clear them out
    -  evt.waitUntil(
    -    caches
    -      .keys()
    -      .then(cacheNames => {
    -        return cacheNames.filter(item => !Object.values(CACHE_KEYS).includes(item));
    -      })
    -      .then(itemsToDelete => {
    -        return Promise.all(
    -          itemsToDelete.map(item => {
    -            return caches.delete(item);
    -          })
    -        );
    -      })
    -      .then(() => self.clients.claim())
    -  );
    -});
    -
    -self.addEventListener('fetch', evt => {
    -  evt.respondWith(
    -    caches.match(evt.request).then(cachedResponse => {
    -      // Item found in cache so return
    -      if (cachedResponse) {
    -        return cachedResponse;
    -      }
    -
    -      // Nothing found so load up the request from the network
    -      return caches.open(CACHE_KEYS.MAIN).then(cache => {
    -        return fetch(evt.request)
    -          .then(response => {
    -            // Put the new response in cache and return it
    -            return cache.put(evt.request, response.clone()).then(() => {
    -              return response;
    -            });
    -          })
    -          .catch(ex => {
    -            return;
    -          });
    -      });
    -    })
    -  );
    -});
    -

    What the Service Worker does here is pre-cache our core assets that we define in PRE_CACHE_URLS. Then, for each fetch event which is called per request, it’ll try to fulfil the request from cache first. If it can’t do that, it’ll load the remote request for us. With this setup, we achieve two things:

    -
      -
    1. We get offline support because we stick our critical assets in cache immediately so they will be accessible offline
    2. -
    3. Once those critical assets and any other requested assets are cached, the app will run faster by default
    4. -
    -

    Importantly now, because we have a manifest, some shortcut icons and a Service Worker that gives us offline support, we have a fully installable PWA!

    -

    Wrapping up

    -

    I hope with this simplified example you can see how approaching web design and development with a progressive enhancement approach, everyone gets an acceptable experience instead of those who are lucky enough to get every aspect of the page at the right time.

    -

    Jotter is very much live and in the process of being enhanced further, which you can see on its little in-app roadmap, so go ahead and play around with it.

    -

    Before you know it, it’ll be a car itself, but remember: it’ll always start as a humble little <textarea>.

    -
    -
    - -
    -
    -

    About the author

    -
    -
    -
    - -

    Andy Bell is an independent designer and front-end developer who’s trying to make everyone’s experience on the web better with a focus on progressive enhancement and accessibility.

    -

    More articles by Andy

    - -
    -
    -
    - - - - - - - - - - - - - -
    -
    -

    Comments

    -
    - -
    - - - - -
    -
    - diff --git a/tests/tags_migration/index.sqlite3 b/tests/tags_migration/index.sqlite3 deleted file mode 100755 index 04d35a71e6..0000000000 Binary files a/tests/tags_migration/index.sqlite3 and /dev/null differ diff --git a/tests/test_add.py b/tests/test_add.py deleted file mode 100644 index 331178fe05..0000000000 --- a/tests/test_add.py +++ /dev/null @@ -1,93 +0,0 @@ -import subprocess -import json -import sqlite3 - -from .fixtures import * - -def test_depth_flag_is_accepted(process, disable_extractors_dict): - arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=0"], - capture_output=True, env=disable_extractors_dict) - assert 'unrecognized arguments: --depth' not in arg_process.stderr.decode("utf-8") - - -def test_depth_flag_fails_if_it_is_not_0_or_1(process, disable_extractors_dict): - arg_process = subprocess.run( - ["archivebox", "add", "--depth=5", "http://127.0.0.1:8080/static/example.com.html"], - capture_output=True, - env=disable_extractors_dict, - ) - assert 'invalid choice' in arg_process.stderr.decode("utf-8") - arg_process = subprocess.run( - ["archivebox", "add", "--depth=-1", "http://127.0.0.1:8080/static/example.com.html"], - capture_output=True, - env=disable_extractors_dict, - ) - assert 'invalid choice' in arg_process.stderr.decode("utf-8") - - -def test_depth_flag_0_crawls_only_the_arg_page(tmp_path, process, disable_extractors_dict): - arg_process = subprocess.run( - ["archivebox", "add", "--depth=0", "http://127.0.0.1:8080/static/example.com.html"], - capture_output=True, - env=disable_extractors_dict, - ) - - archived_item_path = list(tmp_path.glob('archive/**/*'))[0] - with open(archived_item_path / "index.json", "r", encoding='utf-8') as f: - output_json = json.load(f) - assert output_json["base_url"] == "127.0.0.1:8080/static/example.com.html" - - -def test_depth_flag_1_crawls_the_page_AND_links(tmp_path, process, disable_extractors_dict): - arg_process = subprocess.run( - 
["archivebox", "add", "--depth=1", "http://127.0.0.1:8080/static/example.com.html"], - capture_output=True, - env=disable_extractors_dict, - ) - - conn = sqlite3.connect("index.sqlite3") - c = conn.cursor() - urls = c.execute("SELECT url from core_snapshot").fetchall() - conn.commit() - conn.close() - - urls = list(map(lambda x: x[0], urls)) - assert "http://127.0.0.1:8080/static/example.com.html" in urls - assert "http://127.0.0.1:8080/static/iana.org.html" in urls - - -def test_overwrite_flag_is_accepted(process, disable_extractors_dict): - subprocess.run( - ["archivebox", "add", "--depth=0", "http://127.0.0.1:8080/static/example.com.html"], - capture_output=True, - env=disable_extractors_dict, - ) - arg_process = subprocess.run( - ["archivebox", "add", "--overwrite", "http://127.0.0.1:8080/static/example.com.html"], - capture_output=True, - env=disable_extractors_dict, - ) - assert 'unrecognized arguments: --overwrite' not in arg_process.stderr.decode("utf-8") - assert 'favicon' in arg_process.stdout.decode('utf-8'), 'archive methods probably didnt run, did overwrite work?' 
- -def test_add_updates_history_json_index(tmp_path, process, disable_extractors_dict): - subprocess.run( - ["archivebox", "add", "--depth=0", "http://127.0.0.1:8080/static/example.com.html"], - capture_output=True, - env=disable_extractors_dict, - ) - - archived_item_path = list(tmp_path.glob('archive/**/*'))[0] - - with open(archived_item_path / "index.json", "r", encoding="utf-8") as f: - output_json = json.load(f) - assert output_json["history"] != {} - -def test_extract_input_uses_only_passed_extractors(tmp_path, process): - subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--extract", "wget"], - capture_output=True) - - archived_item_path = list(tmp_path.glob('archive/**/*'))[0] - - assert (archived_item_path / "warc").exists() - assert not (archived_item_path / "singlefile.html").exists() diff --git a/tests/test_extractors.py b/tests/test_extractors.py deleted file mode 100644 index 86b50d51c8..0000000000 --- a/tests/test_extractors.py +++ /dev/null @@ -1,115 +0,0 @@ -from .fixtures import * -import json as pyjson -from archivebox.extractors import ignore_methods, get_default_archive_methods, should_save_title - -def test_wget_broken_pipe(tmp_path, process, disable_extractors_dict): - disable_extractors_dict.update({"USE_WGET": "true"}) - add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], - capture_output=True, env=disable_extractors_dict) - assert "TypeError chmod_file(..., path: str) got unexpected NoneType argument path=None" not in add_process.stdout.decode("utf-8") - -def test_ignore_methods(): - """ - Takes the passed method out of the default methods list and returns that value - """ - ignored = ignore_methods(['title']) - assert should_save_title not in ignored - -def test_singlefile_works(tmp_path, process, disable_extractors_dict): - disable_extractors_dict.update({"USE_SINGLEFILE": "true"}) - add_process = subprocess.run(['archivebox', 'add', 
'http://127.0.0.1:8080/static/example.com.html'], - capture_output=True, env=disable_extractors_dict) - archived_item_path = list(tmp_path.glob('archive/**/*'))[0] - output_file = archived_item_path / "singlefile.html" - assert output_file.exists() - -def test_readability_works(tmp_path, process, disable_extractors_dict): - disable_extractors_dict.update({"USE_READABILITY": "true"}) - add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], - capture_output=True, env=disable_extractors_dict) - archived_item_path = list(tmp_path.glob("archive/**/*"))[0] - output_file = archived_item_path / "readability" / "content.html" - assert output_file.exists() - -def test_mercury_works(tmp_path, process, disable_extractors_dict): - disable_extractors_dict.update({"USE_MERCURY": "true"}) - add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], - capture_output=True, env=disable_extractors_dict) - archived_item_path = list(tmp_path.glob("archive/**/*"))[0] - output_file = archived_item_path / "mercury" / "content.html" - assert output_file.exists() - -def test_readability_works_with_wget(tmp_path, process, disable_extractors_dict): - disable_extractors_dict.update({"USE_READABILITY": "true", "USE_WGET": "true"}) - add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], - capture_output=True, env=disable_extractors_dict) - archived_item_path = list(tmp_path.glob("archive/**/*"))[0] - output_file = archived_item_path / "readability" / "content.html" - assert output_file.exists() - -def test_readability_works_with_singlefile(tmp_path, process, disable_extractors_dict): - disable_extractors_dict.update({"USE_READABILITY": "true", "USE_SINGLEFILE": "true"}) - add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], - capture_output=True, env=disable_extractors_dict) - archived_item_path = 
list(tmp_path.glob("archive/**/*"))[0] - output_file = archived_item_path / "readability" / "content.html" - assert output_file.exists() - -def test_readability_works_with_dom(tmp_path, process, disable_extractors_dict): - disable_extractors_dict.update({"USE_READABILITY": "true", "SAVE_DOM": "true"}) - add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], - capture_output=True, env=disable_extractors_dict) - archived_item_path = list(tmp_path.glob("archive/**/*"))[0] - output_file = archived_item_path / "readability" / "content.html" - assert output_file.exists() - -def test_use_node_false_disables_readability_and_singlefile(tmp_path, process, disable_extractors_dict): - disable_extractors_dict.update({"USE_READABILITY": "true", "SAVE_DOM": "true", "USE_SINGLEFILE": "true", "USE_NODE": "false"}) - add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], - capture_output=True, env=disable_extractors_dict) - output_str = add_process.stdout.decode("utf-8") - assert "> singlefile" not in output_str - assert "> readability" not in output_str - -def test_headers_ignored(tmp_path, process, disable_extractors_dict): - add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/headers/example.com.html'], - capture_output=True, env=disable_extractors_dict) - archived_item_path = list(tmp_path.glob("archive/**/*"))[0] - output_file = archived_item_path / "headers.json" - assert not output_file.exists() - -def test_headers_retrieved(tmp_path, process, disable_extractors_dict): - disable_extractors_dict.update({"SAVE_HEADERS": "true"}) - add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/headers/example.com.html'], - capture_output=True, env=disable_extractors_dict) - archived_item_path = list(tmp_path.glob("archive/**/*"))[0] - output_file = archived_item_path / "headers.json" - assert output_file.exists() - headers_file = 
archived_item_path / 'headers.json' - with open(headers_file, 'r', encoding='utf-8') as f: - headers = pyjson.load(f) - assert headers['Content-Language'] == 'en' - assert headers['Content-Script-Type'] == 'text/javascript' - assert headers['Content-Style-Type'] == 'text/css' - -def test_headers_redirect_chain(tmp_path, process, disable_extractors_dict): - disable_extractors_dict.update({"SAVE_HEADERS": "true"}) - add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/redirect/headers/example.com.html'], - capture_output=True, env=disable_extractors_dict) - archived_item_path = list(tmp_path.glob("archive/**/*"))[0] - output_file = archived_item_path / "headers.json" - with open(output_file, 'r', encoding='utf-8') as f: - headers = pyjson.load(f) - assert headers['Content-Language'] == 'en' - assert headers['Content-Script-Type'] == 'text/javascript' - assert headers['Content-Style-Type'] == 'text/css' - -def test_headers_400_plus(tmp_path, process, disable_extractors_dict): - disable_extractors_dict.update({"SAVE_HEADERS": "true"}) - add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/400/example.com.html'], - capture_output=True, env=disable_extractors_dict) - archived_item_path = list(tmp_path.glob("archive/**/*"))[0] - output_file = archived_item_path / "headers.json" - with open(output_file, 'r', encoding='utf-8') as f: - headers = pyjson.load(f) - assert headers["Status-Code"] == "200" diff --git a/tests/test_init.py b/tests/test_init.py deleted file mode 100644 index 728aedfb57..0000000000 --- a/tests/test_init.py +++ /dev/null @@ -1,176 +0,0 @@ -# archivebox init -# archivebox add - -import os -import subprocess -from pathlib import Path -import json, shutil -import sqlite3 - -from archivebox.config import OUTPUT_PERMISSIONS - -from .fixtures import * - -def test_init(tmp_path, process): - assert "Initializing a new ArchiveBox" in process.stdout.decode("utf-8") - -def test_update(tmp_path, process): - 
os.chdir(tmp_path) - update_process = subprocess.run(['archivebox', 'init'], capture_output=True) - assert "updating existing ArchiveBox" in update_process.stdout.decode("utf-8") - -def test_add_link(tmp_path, process, disable_extractors_dict): - disable_extractors_dict.update({"USE_WGET": "true"}) - os.chdir(tmp_path) - add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], - capture_output=True, env=disable_extractors_dict) - archived_item_path = list(tmp_path.glob('archive/**/*'))[0] - - assert "index.json" in [x.name for x in archived_item_path.iterdir()] - - with open(archived_item_path / "index.json", "r", encoding="utf-8") as f: - output_json = json.load(f) - assert "Example Domain" == output_json['history']['title'][0]['output'] - - with open(archived_item_path / "index.html", "r", encoding="utf-8") as f: - output_html = f.read() - assert "Example Domain" in output_html - - -def test_add_link_support_stdin(tmp_path, process, disable_extractors_dict): - disable_extractors_dict.update({"USE_WGET": "true"}) - os.chdir(tmp_path) - stdin_process = subprocess.Popen(["archivebox", "add"], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, - env=disable_extractors_dict) - stdin_process.communicate(input="http://127.0.0.1:8080/static/example.com.html".encode()) - archived_item_path = list(tmp_path.glob('archive/**/*'))[0] - - assert "index.json" in [x.name for x in archived_item_path.iterdir()] - - with open(archived_item_path / "index.json", "r", encoding="utf-8") as f: - output_json = json.load(f) - assert "Example Domain" == output_json['history']['title'][0]['output'] - -def test_correct_permissions_output_folder(tmp_path, process): - index_files = ['index.sqlite3', 'archive'] - for file in index_files: - file_path = tmp_path / file - assert oct(file_path.stat().st_mode)[-3:] == OUTPUT_PERMISSIONS - -def test_correct_permissions_add_command_results(tmp_path, process, disable_extractors_dict): - 
os.chdir(tmp_path) - add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True, - env=disable_extractors_dict) - archived_item_path = list(tmp_path.glob('archive/**/*'))[0] - for path in archived_item_path.iterdir(): - assert oct(path.stat().st_mode)[-3:] == OUTPUT_PERMISSIONS - -def test_collision_urls_different_timestamps(tmp_path, process, disable_extractors_dict): - os.chdir(tmp_path) - subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True, - env=disable_extractors_dict) - subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/iana.org.html'], capture_output=True, - env=disable_extractors_dict) - archive_folders = [x.name for x in (tmp_path / "archive").iterdir()] - - first_archive = tmp_path / "archive" / str(min([float(folder) for folder in archive_folders])) - json_index = str(first_archive / "index.json") - with open(json_index, "r", encoding="utf-8") as f: - link_details = json.loads(f.read()) - - link_details["url"] = "http://127.0.0.1:8080/static/iana.org.html" - with open(json_index, "w", encoding="utf-8") as f: - json.dump(link_details, f) - - init_process = subprocess.run(['archivebox', 'init'], capture_output=True, env=disable_extractors_dict) - # 1 from duplicated url, 1 from corrupted index - assert "Skipped adding 2 invalid link data directories" in init_process.stdout.decode("utf-8") - assert init_process.returncode == 0 - -def test_collision_timestamps_different_urls(tmp_path, process, disable_extractors_dict): - os.chdir(tmp_path) - subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True, - env=disable_extractors_dict) - subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/iana.org.html'], capture_output=True, - env=disable_extractors_dict) - archive_folders = [x.name for x in (tmp_path / "archive").iterdir()] - first_archive = tmp_path / "archive" / 
str(min([float(folder) for folder in archive_folders])) - archive_folders.remove(first_archive.name) - json_index = str(first_archive / "index.json") - - with open(json_index, "r", encoding="utf-8") as f: - link_details = json.loads(f.read()) - - link_details["timestamp"] = archive_folders[0] - - with open(json_index, "w", encoding="utf-8") as f: - json.dump(link_details, f) - - init_process = subprocess.run(['archivebox', 'init'], capture_output=True, env=disable_extractors_dict) - assert "Skipped adding 1 invalid link data directories" in init_process.stdout.decode("utf-8") - assert init_process.returncode == 0 - -def test_orphaned_folders(tmp_path, process, disable_extractors_dict): - os.chdir(tmp_path) - subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True, - env=disable_extractors_dict) - list_process = subprocess.run(["archivebox", "list", "--json", "--with-headers"], capture_output=True) - with open(tmp_path / "index.json", "wb") as f: - f.write(list_process.stdout) - conn = sqlite3.connect("index.sqlite3") - c = conn.cursor() - c.execute("DELETE from core_snapshot") - conn.commit() - conn.close() - - init_process = subprocess.run(['archivebox', 'init'], capture_output=True, env=disable_extractors_dict) - assert "Added 1 orphaned links from existing JSON index" in init_process.stdout.decode("utf-8") - assert init_process.returncode == 0 - -def test_unrecognized_folders(tmp_path, process, disable_extractors_dict): - os.chdir(tmp_path) - subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True, - env=disable_extractors_dict) - (tmp_path / "archive" / "some_random_folder").mkdir() - - init_process = subprocess.run(['archivebox', 'init'], capture_output=True, env=disable_extractors_dict) - assert "Skipped adding 1 invalid link data directories" in init_process.stdout.decode("utf-8") - assert init_process.returncode == 0 - -def test_tags_migration(tmp_path, 
disable_extractors_dict): - - base_sqlite_path = Path(__file__).parent / 'tags_migration' - - if os.path.exists(tmp_path): - shutil.rmtree(tmp_path) - shutil.copytree(str(base_sqlite_path), tmp_path) - os.chdir(tmp_path) - - conn = sqlite3.connect("index.sqlite3") - conn.row_factory = sqlite3.Row - c = conn.cursor() - c.execute("SELECT id, tags from core_snapshot") - snapshots = c.fetchall() - snapshots_dict = { sn['id']: sn['tags'] for sn in snapshots} - conn.commit() - conn.close() - - init_process = subprocess.run(['archivebox', 'init'], capture_output=True, env=disable_extractors_dict) - - conn = sqlite3.connect("index.sqlite3") - conn.row_factory = sqlite3.Row - c = conn.cursor() - c.execute(""" - SELECT core_snapshot.id, core_tag.name from core_snapshot - JOIN core_snapshot_tags on core_snapshot_tags.snapshot_id=core_snapshot.id - JOIN core_tag on core_tag.id=core_snapshot_tags.tag_id - """) - tags = c.fetchall() - conn.commit() - conn.close() - - for tag in tags: - snapshot_id = tag["id"] - tag_name = tag["name"] - # Check each tag migrated is in the previous field - assert tag_name in snapshots_dict[snapshot_id] diff --git a/tests/test_list.py b/tests/test_list.py deleted file mode 100644 index a99ed64589..0000000000 --- a/tests/test_list.py +++ /dev/null @@ -1,67 +0,0 @@ -import json - -from .fixtures import * - -def test_list_json(process, disable_extractors_dict): - subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=0"], - capture_output=True, env=disable_extractors_dict) - list_process = subprocess.run(["archivebox", "list", "--json"], capture_output=True) - output_json = json.loads(list_process.stdout.decode("utf-8")) - assert output_json[0]["url"] == "http://127.0.0.1:8080/static/example.com.html" - - -def test_list_json_headers(process, disable_extractors_dict): - subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=0"], - capture_output=True, 
env=disable_extractors_dict) - list_process = subprocess.run(["archivebox", "list", "--json", "--with-headers"], capture_output=True) - output_json = json.loads(list_process.stdout.decode("utf-8")) - assert output_json["links"][0]["url"] == "http://127.0.0.1:8080/static/example.com.html" - -def test_list_html(process, disable_extractors_dict): - subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=0"], - capture_output=True, env=disable_extractors_dict) - list_process = subprocess.run(["archivebox", "list", "--html"], capture_output=True) - output_html = list_process.stdout.decode("utf-8") - assert "