From 499ae3c4f9d7927783d186e1b795dc4486ce8fb9 Mon Sep 17 00:00:00 2001 From: Thomas Zahner Date: Tue, 4 Nov 2025 13:43:32 +0100 Subject: [PATCH 1/6] Update installation command for Nix to use nix-shell --- src/content/docs/guides/getting-started.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/content/docs/guides/getting-started.mdx b/src/content/docs/guides/getting-started.mdx index aad6f15..98a2192 100644 --- a/src/content/docs/guides/getting-started.mdx +++ b/src/content/docs/guides/getting-started.mdx @@ -23,7 +23,7 @@ You can install Lychee using various package managers. - + From bb165b14b6878ef73d7dc8b38f53e6155804dc9d Mon Sep 17 00:00:00 2001 From: Thomas Zahner Date: Tue, 4 Nov 2025 16:09:10 +0100 Subject: [PATCH 2/6] Document preprocessor --- astro.config.mjs | 1 + src/content/docs/guides/getting-started.mdx | 21 ++----- src/content/docs/guides/preprocessing.md | 69 +++++++++++++++++++++ 3 files changed, 74 insertions(+), 17 deletions(-) create mode 100644 src/content/docs/guides/preprocessing.md diff --git a/astro.config.mjs b/astro.config.mjs index 4bd10da..99e78a4 100644 --- a/astro.config.mjs +++ b/astro.config.mjs @@ -42,6 +42,7 @@ export default defineConfig({ "guides/config", "guides/cli", "guides/output", + "guides/preprocessing", ], }, { diff --git a/src/content/docs/guides/getting-started.mdx b/src/content/docs/guides/getting-started.mdx index 98a2192..3f51ad9 100644 --- a/src/content/docs/guides/getting-started.mdx +++ b/src/content/docs/guides/getting-started.mdx @@ -206,24 +206,11 @@ In this command, we ignore the case when globbing, so it matches - `~/projects/rust_game_/README` - `~/projects/python_script_/Readme.markdown` -### Check Links From Epub File +### Check other file formats -If you have [atool](https://www.nongnu.org/atool) installed, you can check links inside `.epub` files as well! 
- -```bash -acat -F zip {file.epub} "_.xhtml" "_.html" | lychee - -``` - -:::caution[Attention] -lychee parses other file formats as plaintext and extracts links using [linkify](https://github.com/robinst/linkify). -This generally works well if there are no format- or encoding -specifics, but in case you need dedicated support for a new file format, please -consider [creating an issue](https://github.com/lycheeverse/lychee/issues). -::: - -[atool]: https://www.nongnu.org/atool -[linkify]: https://github.com/robinst/linkify -[issue]: https://github.com/lycheeverse/lychee/issues +By preprocessing files it is possible to do link checking on +files which aren't officially supported by lychee. +See [file preprocessing](preprocessing). ## GitHub Action diff --git a/src/content/docs/guides/preprocessing.md b/src/content/docs/guides/preprocessing.md new file mode 100644 index 0000000..16a6e05 --- /dev/null +++ b/src/content/docs/guides/preprocessing.md @@ -0,0 +1,69 @@ +--- +title: File preprocessing +--- + +Out of the box lychee supports HTML, Markdown and plain text formats. +More precisely, HTML files are parsed as HTML5 with the use of the [html5ever] parser. +Markdown files are treated as [CommonMark] with the use of [pulldown-cmark]. + +For any other file format lychee falls back to a "plain text" mode. +This means that [linkify] attempts to extract URLs on a best-effort basis. +If invalid UTF-8 characters are encountered, the input file is skipped, +because it is assumed that the file is in a binary format lychee cannot understand. + +lychee allows file preprocessing with the `--preprocess` flag. +For each input file the command specified with `--preprocess` is invoked instead of reading the input file directly. +The following examples show how to preprocess common file formats. +In most cases it's necessary to create a helper script for preprocessing, +as no parameters can be supplied from the CLI directly. 
+ +```bash +lychee files/* --preprocess ./preprocess.sh +``` + +The referenced `preprocess.sh` script could look like this: + +```bash +#!/usr/bin/env bash + +case "$1" in +*.pdf) + exec pdftohtml -i -s -stdout "$1" + # Alternatives: + # exec pdftotext "$1" - + # exec pdftk "$1" output - uncompress | grep -aPo '/URI *\(\K[^)]*' + ;; +*.odt|*.docx|*.epub|*.ipynb) + exec pandoc "$1" --to=html --wrap=none --markdown-headings=atx + ;; +*.odp|*.pptx|*.ods|*.xlsx) + # libreoffice can't print to stdout unfortunately + libreoffice --headless --convert-to html "$1" --outdir /tmp + file=$(basename "$1") + file="/tmp/${file%.*}.html" + sed '/ Date: Tue, 4 Nov 2025 16:23:21 +0100 Subject: [PATCH 3/6] Update URL --- src/content/docs/guides/preprocessing.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/content/docs/guides/preprocessing.md b/src/content/docs/guides/preprocessing.md index 16a6e05..2cf873a 100644 --- a/src/content/docs/guides/preprocessing.md +++ b/src/content/docs/guides/preprocessing.md @@ -66,4 +66,4 @@ Feel free to open up an issue if you are missing a specific file format or have [html5ever]: https://github.com/servo/html5ever [CommonMark]: https://commonmark.org/ [pulldown-cmark]: https://github.com/pulldown-cmark/pulldown-cmark/ -[lychee-all]: https://github.com/thomas-zahner/lychee-all/ +[lychee-all]: https://github.com/lycheeverse/lychee-all From eb4427507b1d431ecab467ab8a81142291499688 Mon Sep 17 00:00:00 2001 From: Thomas Zahner Date: Thu, 6 Nov 2025 22:20:39 +0100 Subject: [PATCH 4/6] Simplify workflow: check all links instead of additions --- .github/workflows/check-pr-links.yml | 37 ++++++---------------------- .lycheeignore | 5 ++-- 2 files changed, 10 insertions(+), 32 deletions(-) diff --git a/.github/workflows/check-pr-links.yml b/.github/workflows/check-pr-links.yml index 98afdfe..a333357 100644 --- a/.github/workflows/check-pr-links.yml +++ b/.github/workflows/check-pr-links.yml @@ -24,44 +24,21 @@ jobs: with: version: 
latest - - name: Check out master branch - run: git checkout master - - - name: Install dependencies for master - run: pnpm install --frozen-lockfile - - - name: Build site from master - run: pnpm build - - - name: Dump all links from master - id: dump_links_from_master - uses: lycheeverse/lychee-action@v2 - with: - args: '--dump --root-dir ${{ github.workspace }}/dist --exclude-all-private dist' - output: ./links-master.txt - - - name: Stash untracked files - run: git stash push --include-untracked - - - name: Check out feature branch - run: git fetch origin ${{ github.ref }} && git checkout FETCH_HEAD - - - name: Apply stashed changes - run: git stash pop || true - - name: Install dependencies for feature branch run: pnpm install --frozen-lockfile - name: Build site from feature branch run: pnpm build - - name: Append links-master.txt to .lycheeignore - run: cat links-master.txt >> .lycheeignore - - - name: Check links in PR changes + - name: Check links uses: lycheeverse/lychee-action@v2 with: - args: '--root-dir ${{ github.workspace }}/dist --exclude-all-private dist' + # Remap live URLs to build directory because the links are potentially not live (not yet on master) + args: | + --root-dir $PWD/dist + --exclude-all-private + --remap 'https://lychee\.cli\.rs/(.*)/ file://'$PWD'/dist/$1/index.html' + dist fail: true - name: Suggestions diff --git a/.lycheeignore b/.lycheeignore index 1438bff..0f17418 100644 --- a/.lycheeignore +++ b/.lycheeignore @@ -2,7 +2,8 @@ https://api.reacher.email/v0/check_email file:///home/user/website/ ^https://www/$ ^https://web/$ -# 404 page returns a 404, d'oh -https://lychee.cli.rs/404/ # Errors with "Too Many Requests" https://www.nongnu.org/atool + +# 404 page is directly in dist/404.html but we've remapped it to an invalid path +dist/404/index.html$ From fdb3c68f36f189d82a15cd0d7b76ee39ab71edf9 Mon Sep 17 00:00:00 2001 From: Thomas Zahner Date: Thu, 6 Nov 2025 23:42:02 +0100 Subject: [PATCH 5/6] Link check sources as well --- 
.github/workflows/check-pr-links.yml | 25 ++++++------------------- .lycheeignore | 4 ++++ src/content/docs/recipes/base-url.mdx | 12 ++++++------ 3 files changed, 16 insertions(+), 25 deletions(-) diff --git a/.github/workflows/check-pr-links.yml b/.github/workflows/check-pr-links.yml index a333357..e41bbc8 100644 --- a/.github/workflows/check-pr-links.yml +++ b/.github/workflows/check-pr-links.yml @@ -9,26 +9,12 @@ jobs: linkChecker: runs-on: ubuntu-latest steps: - - name: Clone repository - uses: actions/checkout@v5 - with: - fetch-depth: 0 - - - name: Setup Node.js - uses: actions/setup-node@v6 - with: - node-version: "20" + - uses: actions/checkout@v5 - - name: Setup pnpm - uses: pnpm/action-setup@v4 + - name: Build site + uses: withastro/action@v5 with: - version: latest - - - name: Install dependencies for feature branch - run: pnpm install --frozen-lockfile - - - name: Build site from feature branch - run: pnpm build + package-manager: pnpm@latest - name: Check links uses: lycheeverse/lychee-action@v2 @@ -38,7 +24,8 @@ jobs: --root-dir $PWD/dist --exclude-all-private --remap 'https://lychee\.cli\.rs/(.*)/ file://'$PWD'/dist/$1/index.html' - dist + dist/ + src/ fail: true - name: Suggestions diff --git a/.lycheeignore b/.lycheeignore index 0f17418..775f8fd 100644 --- a/.lycheeignore +++ b/.lycheeignore @@ -7,3 +7,7 @@ https://www.nongnu.org/atool # 404 page is directly in dist/404.html but we've remapped it to an invalid path dist/404/index.html$ + +# Code examples in base-url.mdx which don't exist +/docs/about.php$ +/docs/recipes/guide.php$ diff --git a/src/content/docs/recipes/base-url.mdx b/src/content/docs/recipes/base-url.mdx index ea11228..ff6c446 100644 --- a/src/content/docs/recipes/base-url.mdx +++ b/src/content/docs/recipes/base-url.mdx @@ -66,15 +66,15 @@ Here's what happens to different types of links: -Guide -About -External +Guide +About +Absolute -Guide -About -External`} lang={fileLang} +Guide +About +Absolute`} lang={fileLang} title="Link 
Resolution Example" /> ## Common Use Cases From 209c7aa933890b8faa15e31e209bd618080fe138 Mon Sep 17 00:00:00 2001 From: Thomas Zahner Date: Fri, 7 Nov 2025 09:11:00 +0100 Subject: [PATCH 6/6] Fix broken link & update .lycheeignore --- .lycheeignore | 10 +++++++--- src/content/docs/guides/getting-started.mdx | 2 +- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/.lycheeignore b/.lycheeignore index 775f8fd..3646e85 100644 --- a/.lycheeignore +++ b/.lycheeignore @@ -1,9 +1,9 @@ -https://api.reacher.email/v0/check_email file:///home/user/website/ ^https://www/$ ^https://web/$ -# Errors with "Too Many Requests" -https://www.nongnu.org/atool + +# URL is used with POST +https://api.reacher.email/v0/check_email # 404 page is directly in dist/404.html but we've remapped it to an invalid path dist/404/index.html$ @@ -11,3 +11,7 @@ dist/404/index.html$ # Code examples in base-url.mdx which don't exist /docs/about.php$ /docs/recipes/guide.php$ + +# Websites with aggressive rate limiting / bot detection +https://www.nongnu.org/atool +https://builtwith.com/ diff --git a/src/content/docs/guides/getting-started.mdx b/src/content/docs/guides/getting-started.mdx index 3f51ad9..5d5eef9 100644 --- a/src/content/docs/guides/getting-started.mdx +++ b/src/content/docs/guides/getting-started.mdx @@ -210,7 +210,7 @@ In this command, we ignore the case when globbing, so it matches By preprocessing files it is possible to do link checking on files which aren't officially supported by lychee. -See [file preprocessing](preprocessing). +See [file preprocessing](/guides/preprocessing). ## GitHub Action