diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c2a7957d..f82d5534 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,33 +2,34 @@ name: CI on: push: - branches: [ '*' ] pull_request: - branches: [ '*' ] - workflow_dispatch: - schedule: - # cron is kinda random, assumes 22:00 UTC is a low ebb, eastern - # countries are very early morning, and US are mid-day to - # mid-afternoon - - cron: '0 22 * * 2' + +permissions: {} jobs: checks: runs-on: ubuntu-latest steps: - name: Checkout working copy - uses: actions/checkout@v4 + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # 6.0.1 + with: + submodules: true + fetch-depth: 0 + persist-credentials: false - name: ruff check - uses: chartboost/ruff-action@v1 + uses: astral-sh/ruff-action@57714a7c8a2e59f32539362ba31877a1957dded1 # 3.5.1 + with: + version: "latest" - name: ruff format if: always() - uses: chartboost/ruff-action@v1 + uses: astral-sh/ruff-action@57714a7c8a2e59f32539362ba31877a1957dded1 # 3.5.1 with: - args: format --diff + version: "latest" + args: format --check --diff - name: Set up Python id: setup_python if: always() - uses: actions/setup-python@v5 + uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # 6.1.0 with: python-version: "3.x" - name: Install mypy @@ -36,7 +37,7 @@ jobs: if: ${{ always() && steps.setup_python.conclusion == 'success' }} run: | python -mpip install --upgrade pip - python -mpip install mypy types-PyYaml + python -mpip install mypy types-PyYaml ./ua-parser-builtins - name: mypy if: ${{ always() && steps.install_mypy.conclusion == 'success' }} run: mypy @@ -48,11 +49,12 @@ jobs: steps: - name: Checkout working copy - uses: actions/checkout@v4 + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # 6.0.1 with: submodules: true + persist-credentials: false - name: Set up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # 6.1.0 with: 
python-version: "3.x" - name: Install dependency @@ -63,14 +65,14 @@ jobs: run: | python -mbuild - name: Upload sdist - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # 6.0.0 with: name: sdist path: dist/*.tar.gz retention-days: 1 - name: Upload wheel - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # 6.0.0 with: name: wheel path: dist/*.whl @@ -79,7 +81,6 @@ jobs: test: runs-on: ubuntu-latest needs: compile - continue-on-error: ${{ matrix.python-version == '3.13' || matrix.python-version == 'pypy-3.11' }} strategy: fail-fast: false matrix: @@ -88,62 +89,68 @@ jobs: - sdist - source python-version: - - "3.8" - - "3.9" - "3.10" - "3.11" - "3.12" - "3.13" - - "pypy-3.8" - - "pypy-3.9" - - "pypy-3.10" - # - "pypy-3.11" - # don't enable graal because it's slower than even pypy and - # fails because oracle/graalpython#385 - # - "graalpy-23" + - "3.14" + - "3.14t" + - "pypy-3.11" + - "graalpy-25" include: - source: sdist artifact: dist/*.tar.gz - source: wheel artifact: dist/*.whl + - opts: "" + - python-version: graalpy-25 + opts: "--experimental-options --engine.CompileOnly='~tregex re'" steps: - name: Checkout working copy - uses: actions/checkout@v4 + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # 6.0.1 with: submodules: true + fetch-depth: 0 + persist-credentials: false - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 + uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # 6.1.0 with: python-version: ${{ matrix.python-version }} allow-prereleases: true - - name: Install test dependencies - run: | - python -mpip install --upgrade pip - # cyaml is outright broken on pypy - if ! ${{ startsWith(matrix.python-version, 'pypy-') }}; then - # if binary wheels are not available for the current - # package install libyaml-dev so we can install pyyaml - # from source - if ! 
pip download --only-binary pyyaml -rrequirements_dev.txt > /dev/null 2>&1; then - sudo apt install libyaml-dev - fi + - run: python -mpip install --upgrade pip + - run: | + # if binary wheels are not available for the current + # package install libyaml-dev so we can install pyyaml + # from source + if ! pip download --only-binary :all: pyyaml > /dev/null 2>&1; then + sudo apt install libyaml-dev fi - python -mpip install pytest pyyaml - - # re2 is basically impossible to install from source so don't - # bother, and suppress installation failure so the test does - # not fail (re2 tests will just be skipped for versions / - # implementations for which google does not provide a binary - # wheel) - python -mpip install --only-binary :all: google-re2 || true + - run: python -mpip install pytest pyyaml + - run: python -mpip install ./ua-parser-builtins + # install rs accelerator if available, ignore if not + - run: python -mpip install ua-parser-rs || true + # re2 is basically impossible to install from source so don't + # bother, and suppress installation failure so the test does + # not fail (re2 tests will just be skipped for versions / + # implementations for which google does not provide a binary + # wheel) + - run: 'python -mpip install --only-binary :all: google-re2 || true' - name: download ${{ matrix.source }} artifact if: matrix.artifact - uses: actions/download-artifact@v4 + uses: actions/download-artifact@37930b1c2abaa49bbe596cd826c3c89aef350131 # 7.0.0 with: name: ${{ matrix.source }} path: dist/ - name: install package in environment - run: | - pip install ${{ matrix.artifact || '.' }} + run: python -m pip install ${{ matrix.artifact || '.' 
}} - name: run tests - run: pytest -v -Werror -Wignore::ImportWarning --doctest-glob="*.rst" -ra + run: python ${{ matrix.opts }} -m pytest -v -Werror -Wignore::ImportWarning --doctest-glob="*.rst" -ra + # necessary to create a unified CI result and not have to update + # branch protection rules every time the matrix gets updated + results: + name: "CI Results" + needs: ["checks", "test"] + runs-on: ubuntu-latest + permissions: {} + steps: + - run: exit 0 diff --git a/.github/workflows/release-builtins.yml b/.github/workflows/release-builtins.yml new file mode 100644 index 00000000..fa54a251 --- /dev/null +++ b/.github/workflows/release-builtins.yml @@ -0,0 +1,126 @@ +name: Publish ua-parser builtins + +run-name: Publish ${{ inputs.tag || 'master' }} to ${{ inputs.environment || 'pypi (scheduled)' }} + +permissions: {} + +on: + schedule: + # schedule a dev release on every 1st of the month, at 2034 UTC + - cron: "34 20 1 * *" + workflow_dispatch: + inputs: + tag: + description: "uap-core ref to release" + type: string + environment: + description: "environment to release for (testpypi or pypi)" + type: environment + required: true + +jobs: + build: + name: Build distribution + runs-on: ubuntu-latest + outputs: + release: ${{ steps.check.outputs.release }} + steps: + - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # 6.0.1 + with: + submodules: true + fetch-depth: 0 + persist-credentials: false + - name: Set up Python + uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # 6.1.0 + with: + python-version: "3.x" + + - name: Check necessity of release + id: check + env: + PYPI: ${{ github.event.inputs.environment || 'pypi' }} + REF: ${{ inputs.tag || 'HEAD' }} + run: | + case $PYPI in + pypi) + DOMAIN=pypi.org + ;; + testpypi) + DOMAIN=test.pypi.org + ;; + *) + exit 1 + esac + + RELREV=$(python scripts/relrev.py --domain "$DOMAIN") + VERSION=$(date +%Y%m) + CURREV=$(python scripts/tagcore.py --ref $REF --version $VERSION) + + if [ -n 
"$CURREV" -a "$RELREV" = "$CURREV" ] + then + echo "current rev matches latest release, skip new release" + else + echo release=true >> $GITHUB_OUTPUT + fi + - name: Install pypa/build + if: ${{ steps.check.outputs.release == 'true' }} + run: python3 -m pip install build --user + - name: Build wheel + if: ${{ steps.check.outputs.release == 'true' }} + run: | + python3 -m build -w ua-parser-builtins + mv ua-parser-builtins/dist . + - name: Store the distribution packages + if: ${{ steps.check.outputs.release == 'true' }} + uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # 6.0.0 + with: + name: python-package-distributions + path: dist/ + + publish-to-testpypi: + name: Publish to TestPyPI + needs: build + if: ${{ github.event.inputs.environment == 'testpypi' && needs.build.outputs.release == 'true' }} + runs-on: ubuntu-latest + + environment: + name: testpypi + url: https://test.pypi.org/p/ua-parser-builtins + + permissions: + id-token: write + + steps: + - name: Download all the dists + uses: actions/download-artifact@37930b1c2abaa49bbe596cd826c3c89aef350131 # 7.0.0 + with: + name: python-package-distributions + path: dist/ + - name: Publish + uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e # 1.13.0 + with: + repository-url: https://test.pypi.org/legacy/ + skip-existing: true + verbose: true + + publish-to-pypi: + name: publish + needs: build + if: ${{ (github.event_name == 'schedule' || github.event.inputs.environment == 'pypi') && needs.build.outputs.release == 'true' }} + runs-on: ubuntu-latest + environment: + name: pypi + url: https://pypi.org/p/ua-parser-builtins + permissions: + id-token: write + + steps: + - name: Download all the dists + uses: actions/download-artifact@37930b1c2abaa49bbe596cd826c3c89aef350131 # 7.0.0 + with: + name: python-package-distributions + path: dist/ + - name: Publish + uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e # 1.13.0 + with: + verbose: true diff 
--git a/.github/workflows/release-main.yml b/.github/workflows/release-main.yml new file mode 100644 index 00000000..98c17306 --- /dev/null +++ b/.github/workflows/release-main.yml @@ -0,0 +1,50 @@ +name: Publish ua-parser + +on: + workflow_dispatch: + release: + types: [created] + +permissions: {} + +env: + ENVNAME: ${{ github.event_name == 'release' && 'pypi' || 'testpypi' }} + +jobs: + release: + runs-on: ubuntu-latest + + environment: + name: ${{ github.event_name == 'release' && 'pypi' || 'testpypi' }} + url: https://${{ github.event_name != 'release' && 'test.' || '' }}pypi.org/p/ua-parser + + permissions: + id-token: write + + steps: + - name: Checkout working copy + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # 6.0.1 + with: + persist-credentials: false + - name: Set up Python + uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # 6.1.0 + with: + python-version: "3.x" + - name: Install dependency + run: | + python -mpip install --upgrade pip + python -mpip install build + - name: Build sdist and wheel + run: python -mbuild + - name: Publish to testpypi + if: ${{ env.ENVNAME == 'testpypi' }} + uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e # 1.13.0 + with: + repository-url: https://test.pypi.org/legacy/ + skip-existing: true + verbose: true + - name: Publish to pypi + if: ${{ env.ENVNAME == 'pypi' }} + uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e # 1.13.0 + with: + verbose: true diff --git a/.github/workflows/zizmor.yml b/.github/workflows/zizmor.yml new file mode 100644 index 00000000..f94fa7be --- /dev/null +++ b/.github/workflows/zizmor.yml @@ -0,0 +1,34 @@ +name: Zizmor + +on: + push: + pull_request: + +permissions: {} + +jobs: + zizmor: + runs-on: ubuntu-latest + permissions: + security-events: write + contents: read + + steps: + - name: Checkout repository + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # 6.0.1 + with: + persist-credentials: 
false + + - name: Install the latest version of uv + uses: astral-sh/setup-uv@681c641aba71e4a1c380be3ab5e12ad51f415867 # 7.1.6 + + - name: Run zizmor + run: uvx zizmor --format sarif . > results.sarif + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Upload SARIF file + uses: github/codeql-action/upload-sarif@5d4e8d1aca955e8d8589aabd499c5cae939e33c7 # 4.31.9 + with: + sarif_file: results.sarif + category: zizmor diff --git a/.github/zizmor.yml b/.github/zizmor.yml new file mode 100644 index 00000000..62ab71e7 --- /dev/null +++ b/.github/zizmor.yml @@ -0,0 +1,4 @@ +rules: + use-trusted-publishing: + ignore: + - release-main.yml # can't do that until pypi/support#6661 diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 00000000..1eec0722 --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,32 @@ +# Read the Docs configuration file for Sphinx projects +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +version: 2 + +# needed for uap-core to get initialised properly +submodules: + include: all + +build: + os: ubuntu-24.04 + tools: + python: "3.12" + jobs: + post_checkout: + # rtfd doesn't retrieve tags by default, but we need them for `git + # describe` in order to build the parser builtins + # FIXME: remove once upb is published and can be installed from pypi) + - git fetch --unshallow --tags || true + +python: + install: + - method: pip + path: . 
+ extra_requirements: + - yaml + - regex + - re2 + +sphinx: + configuration: doc/conf.py + fail_on_warning: true diff --git a/MANIFEST.in b/MANIFEST.in index 9c004de5..d8aabc5b 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,9 +1,12 @@ -exclude .* -prune .github -global-exclude *~ - include README.rst include LICENSE + graft uap-core -exclude uap-core/.* -recursive-exclude uap-core *.js + +prune .github +prune uap-core/.github +global-exclude *~ +global-exclude .* +global-exclude *.js +global-exclude __pycache__ +global-exclude *.py[co] diff --git a/Makefile b/Makefile deleted file mode 100644 index 0604a25c..00000000 --- a/Makefile +++ /dev/null @@ -1,20 +0,0 @@ -all: test - -test: - tox - -clean: - @find . -name '*.pyc' -delete - @rm -rf tmp \ - src/ua_parser.egg-info \ - dist \ - build \ - src/ua_parser/_regexes.py -format: - @black . - -release: clean - pyproject-build - twine upload -s dist/* - -.PHONY: all test clean format release diff --git a/README.rst b/README.rst index 096a6471..17b405c7 100644 --- a/README.rst +++ b/README.rst @@ -8,39 +8,37 @@ Build Status ------------ .. image:: https://github.com/ua-parser/uap-python/actions/workflows/ci.yml/badge.svg + :target: https://github.com/ua-parser/uap-python/actions/workflows/ci.yml?query=branch%3Amaster :alt: CI on the master branch -⚠️ THIS IS NOT THE DOCUMENTATION YOU ARE LOOKING FOR (probably) ⚠️ ------------------------------------------------------------------- - -This is the readme for the `future 1.0 `_. - -For the current releases, see `the 0.x branch -`_. +.. image:: https://readthedocs.org/projects/uap-python/badge/?version=latest + :target: https://uap-python.readthedocs.io/ + :alt: Documentation Status Installing ---------- -Just add ``ua-parser`` to your project's dependencies, or run +Add ``ua-parser[regex]`` to your project's dependencies, or run .. code-block:: sh - $ pip install ua-parser + $ pip install 'ua-parser[regex]' to install in the current environment. 
-Installing `google-re2 `_ is -*strongly* recommended as it leads to *significantly* better -performances. This can be done directly via the ``re2`` optional -dependency: +ua-parser supports CPython 3.10 and newer, recent pypy (supporting +3.10), and GraalPy 25. -.. code-block:: sh +.. note:: + + The ``[regex]`` feature is *strongly* recommended, the Pure python + (no feature) is *significantly* slower, especially on non-cpython + runtimes, though it is the most memory efficient. - $ pip install 'ua_parser[re2]' + See `builtin resolvers`_ for more explanation of the tradeoffs + between the different options. -If ``re2`` is available, ``ua-parser`` will simply use it by default -instead of the pure-python resolver. +.. _builtin resolvers: https://uap-python.readthedocs.io/stable/guides.html#builtin-resolvers Quick Start ----------- @@ -106,3 +104,10 @@ Extract device information from user-agent string >>> ua_string = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.104 Safari/537.36' >>> parse_device(ua_string) Device(family='Mac', brand='Apple', model='Mac') + +Upgrading +--------- + +Upgrading from 0.x? See `the upgrade guide`_. + +.. _the upgrade guide: https://uap-python.readthedocs.io/stable/advanced/migration.html diff --git a/doc/Makefile b/doc/Makefile deleted file mode 100644 index d4bb2cbb..00000000 --- a/doc/Makefile +++ /dev/null @@ -1,20 +0,0 @@ -# Minimal makefile for Sphinx documentation -# - -# You can set these variables from the command line, and also -# from the environment for the first two. -SPHINXOPTS ?= -SPHINXBUILD ?= sphinx-build -SOURCEDIR = . -BUILDDIR = _build - -# Put it first so that "make" without argument is like "make help". -help: - @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) - -.PHONY: help Makefile - -# Catch-all target: route all unknown targets to Sphinx using the new -# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
-%: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/doc/advanced/caches.rst b/doc/advanced/caches.rst index c804d7c2..9f09701a 100644 --- a/doc/advanced/caches.rst +++ b/doc/advanced/caches.rst @@ -45,14 +45,14 @@ exercises the caches themselves and barely looks at the data. ---------------------------- ``bench`` is much more expensive in both CPU and wallclock as it -actually runs the base resolvers, combined with various caches of -various sizes. For usability, it can report its data (the average -parse time per input entry) in both human-readable text with one -result per line and CSV with resolver configurations as the columns -and cache sizes as the rows. +actually runs the resolvers on the sample file, combined with various +caches of various sizes. For usability, it can report its data (the +average parse time per input entry) in both human-readable text with +one result per line and CSV with resolver configurations as the +columns and cache sizes as the rows. ``hitrates`` is generally sufficient as generally speaking for the -same base resolver performances tend to more or less follo hit rates: +same base resolver performances tend to more or less follow hit rates: a cache hit is close to free compared to a cache miss. Although this is truer for the basic resolver (for which misses tend to be very expensive). ``bench`` is mostly useful to validate or tie-break diff --git a/doc/advanced/migration.rst b/doc/advanced/migration.rst index c77b5d89..a75d85e5 100644 --- a/doc/advanced/migration.rst +++ b/doc/advanced/migration.rst @@ -5,11 +5,13 @@ From 0.x to 1.0 Don't Touch A Thing =================== -The first and simplest way to transition is to not transition: the 0.x -API won't be removed for a long time, possibly ever. While it is -unlikely to get updated any further and will eventually (hopefully?) -fall behind, if you can't be arsed you probably don't have to until an -unlikely 2.0. 
+The first and simplest way to upgrade is to not do anything: the 0.x +API is still present in 1.x and won't be removed for a long time, +possibly ever. + +While it is unlikely to get updated any further and will eventually +(hopefully?) fall behind, if you can't be arsed you probably don't +have to do anything for now, or just now. Unavoidable Divergences ======================= @@ -29,20 +31,20 @@ special attention: # force initialisation of global parser ua_parser.parser -- The 1.0 API defaults to an :class:`re2-based parser - ` if |re2|_ is installed, although it seems - unlikely you may wish to consider replacing it with configuring a - :class:`~ua_parser.Parser` with a :class:`ua_parser.basic.Resolver` - if |re2|_ is one of your dependencies. +- The 1.0 API defaults to powerful native parsers (based on |regex|_ + or |re2|_) if available, although it seems unlikely you may wish to + consider replacing it with configuring a :class:`~ua_parser.Parser` + with a :class:`ua_parser.basic.Resolver`, especially if for some + reason |re2| is already one of your dependencies but you want to + *avoid* the |re2|-based resolver. Default Ruleset =============== While the 1.0 API was designed to better respect :pep:`8` and support -:mod:`typing`, it was also designed to easily be transitioned from. +:mod:`typing`, it was also designed to easily be transitioned to. 
-Given a 0.x API not using YAML, the conversion should be very easy and -consists of: +Given a 0.x API not using YAML, the conversion consists of: - updating the import from ``ua_parser.user_agent_parser`` to just ``ua_parser`` @@ -116,7 +118,7 @@ Legacy YAML support can be added via a pretty small shim:: import ua_parser from ua_parser.loaders import load_yaml - if yaml_path = os.environ.get("UA_PARSER_YAML"): + if yaml_path := os.environ.get("UA_PARSER_YAML"): ua_parser.parser = ua_parser.Parser.from_matchers( load_yaml(yaml_path)) diff --git a/doc/api.rst b/doc/api.rst index 18a7d484..c77eee47 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -75,6 +75,20 @@ from user agent strings. .. warning:: Only available if |re2|_ is installed. +.. class:: ua_parser.regex.Resolver(Matchers) + + An advanced resolver based on `Rust's regex + `_ and a bespoke implementation + of regex prefiltering, by the sibling project `uap-rust + `_. + + Sufficiently fast that a cache may not be necessary, and may even + be detrimental at smaller cache sizes + + .. warning:: Only available if `ua-parser-rs + _ is + installed. + Eager Matchers '''''''''''''' diff --git a/doc/conf.py b/doc/conf.py index f0d38386..cc076432 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -19,9 +19,11 @@ rst_epilog = """ .. |pyyaml| replace:: ``PyYaml`` .. |re2| replace:: ``google-re2`` +.. |regex| replace:: ``regex`` .. _pyyaml: https://pyyaml.org .. _re2: https://pypi.org/project/google-re2 +.. _regex: https://pypi.org/project/ua-parser-rs """ # -- General configuration --------------------------------------------------- diff --git a/doc/guides.rst b/doc/guides.rst index b216d18a..9ea323e2 100644 --- a/doc/guides.rst +++ b/doc/guides.rst @@ -7,10 +7,8 @@ Guides Custom Rulesets =============== -ua-parser defaults to the version of `ua-core -`_ -current when it was packaged, using a precompiled version of -``regexes.yaml``. 
+ua-parser defaults to the latest stable release of `ua-core`_ via +`precompiled regexes.yaml`__. That is a suitable defaut, but there are plenty of reasons to use custom rulesets: @@ -18,10 +16,13 @@ custom rulesets: - trim down the default ruleset to only the most current or relevant rules for efficiency e.g. you might not care about CalDav or podcast applications -- add new rules relevant to your own traffic but which don't (possibly - can't) be in the main project +- add new rules relevant to your own traffic but which aren't (possibly + can't be) in the main project - experiment with the creation of new rules - use a completely bespoke ruleset to track UA-identified API clients +- use "experimental" rules which haven't been released yet (although + `ua-parser-builtins`_ provides regular prerelease versions which may + be suitable for this) ua-parser provides easy ways to load custom rolesets: @@ -38,6 +39,12 @@ ua-parser provides easy ways to load custom rolesets: parser = Parser.from_matchers(load_yaml("regexes.yaml")) parser.parse(some_ua) +.. _ua-parser-builtins: https://pypi.org/project/ua-parser-builtins + +__ ua-parser-builtins_ + +.. _ua-core: https://github.com/ua-parser/uap-core/blob/master/regexes.yaml + .. _guide-custom-global-parser: Custom Global Parser @@ -86,10 +93,10 @@ composing :class:`~ua_parser.Resolver` objects. The most basic such customisation is simply configuring caching away from the default setup. -As an example, in the default configuration if |re2|_ is available the -RE2-based resolver is not cached, a user might consider the memory -investment worth it and want to reconfigure the stack for a cached -base. +As an example, in the default configuration if |regex|_ is available +the regex-based resolver is not cached, a user might consider the +memory investment worth it and want to reconfigure the stack for a +cached base. The process is uncomplicated as the APIs are designed to compose together. 
@@ -98,8 +105,8 @@ The first step is to instantiate a base resolver, instantiated with the relevant :class:`Matchers` data:: import ua_parser.loaders - import ua_parser.re2 - base = ua_parser.re2.Resolver( + import ua_parser.regex + base = ua_parser.regex.Resolver( ua_parser.loaders.load_lazy_builtins()) The next step is to instantiate the cache [#cache]_ suitably @@ -129,6 +136,107 @@ from here on:: :class:`~ua_parser.caching.Local`, which is also caching-related, and serves to use thread-local caches rather than a shared cache. +.. _builtin-resolvers: + +Builtin Resolvers +================= + +.. list-table:: + :header-rows: 1 + :stub-columns: 1 + + * - + - speed + - portability + - memory use + - safety + * - ``regex`` + - great + - good + - fine + - great + * - ``re2`` + - good + - bad + - good + - good + * - ``basic`` + - terrible + - great + - great + - great + +``regex`` +--------- + +The ``regex`` resolver is a bespoke effort as part of the `uap-rust +`_ sibling project, built on +`rust-regex `_ and `a bespoke +regex-prefiltering implementation +`_, +it: + +- Is the fastest available resolver, usually edging out ``re2`` by a + significant margin (when that is even available). +- Is fully controlled by the project, and thus can be built for all + interpreters and platforms supported by pyo3 (currently: cpython, + pypy, and graalpy, on linux, macos and windows, intel and arm). It is + also built as a cpython abi3 wheel and should thus suffer from no + compatibility issues with new releases of cpython at least. +- Built entirely out of safe rust code, its safety risks are entirely + in ``regex`` and ``pyo3``. +- Uses somewhat more memory than the other resolvers (~85MB high water + mark on a real-world dataset). + +If available, it is the default resolver, without a cache. + +``re2`` +------- + +The ``re2`` resolver is built atop the widely used `google-re2 +`_ via its built-in Python bindings. 
+It: + +- Is quite fast, though only about half the speed of ``regex`` on + real-world data. +- Is only compatible with CPython, and uses pure API wheels, so needs + a different release for each cpython version, for each OS, for each + architecture. +- Is built entirely in C++, but by experienced Google developers. +- Is more memory intensive than the pure-python ``basic`` resolver, + but quite slim all things considered (~55MB high water mark on a + real-world dataset). + +If available, it is the second-preferred resolver, without a cache. + +At the end of the day, it is really only useful if the codebase +already uses ``re2``. + +``basic`` +--------- + +The ``basic`` resolver is a naive linear traversal of all rules, using +the standard library's ``re``. It: + +- Is *extremely* slow: about 10x slower than ``re2`` on cpython, and + pypy and graal's regex implementations do *not* like the workload + and are 3x-4x slower than *cpython*. +- Has perfect compatibility, with the caveat above, by virtue of being + built entirely out of standard library code. +- Is basically as safe as Python software can be by virtue of being + just Python, with the native code being the standard library's. +- Is the slimmest resolver at about 40MB. + +This is caveated by a hard requirement to use caches which makes it +workably faster on real-world datasets (if still nowhere near +*uncached* ``re2`` or ``regex``) but increases its memory requirement +significantly e.g. using "sieve" and a cache size of 20000 on a +real-world dataset, it is about 4x slower than ``re2`` for about the +same memory requirements. + +It is the fallback and least preferred resolver, with a medium +(currently 2000 entries) cache by default. 
+ Writing Custom Resolvers ======================== @@ -187,7 +295,7 @@ could then use something like:: Parser(FallbackResolver([ foo_resolver, - re2.Resolver(load_lazy_builtins()), + regex.Resolver(load_lazy_builtins()), ])) to prioritise cheap resolving of our application while still resolving diff --git a/doc/installation.rst b/doc/installation.rst index e8ca58d9..09ba9e83 100644 --- a/doc/installation.rst +++ b/doc/installation.rst @@ -2,32 +2,30 @@ Installation ============ -Python Version -============== - -ua-parser currently supports Python 3.8 and newer, as well as recent -versions of PyPy supporting the same standards. - -.. note:: While PyPy is supported, it is not *fast*, and google-re2 is - not supported on it. - Installation ============ .. include:: ../README.rst - :start-line: 23 + :start-line: 14 :end-before: Quick Start Optional Dependencies ===================== -ua-parser currently has two optional dependencies, |re2|_ and -|pyyaml|_. These dependencies will be detected and used automatically +ua-parser currently has three optional dependencies, |regex|_, |re2|_ and +|pyyaml|_. These dependencies will be detected and used automatically if installed, but can also be installed via and alongside ua-parser: .. code-block:: sh + $ pip install 'ua-parser[regex]' $ pip install 'ua-parser[re2]' $ pip install 'ua-parser[yaml]' - $ pip install 'ua-parser[re2,yaml]' + $ pip install 'ua-parser[regex,yaml]' + +``yaml`` enables the ability to :func:`load rulesets from yaml +`. +The other two features enable more efficient resolvers. By default, +``ua-parser`` will select the fastest resolver it finds out of the +available set (regex > re2 > python). diff --git a/doc/quickstart.rst b/doc/quickstart.rst index 4267ffe0..c6cfe9cf 100644 --- a/doc/quickstart.rst +++ b/doc/quickstart.rst @@ -3,4 +3,4 @@ Quick Start =========== .. 
include:: ../README.rst - :start-line: 47 + :start-line: 44 diff --git a/pyproject.toml b/pyproject.toml index 920fcd0f..d2421d62 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,14 +5,12 @@ build-backend = "setuptools.build_meta" [project] name = "ua-parser" description = "Python port of Browserscope's user agent parser" -version = "1.0.0a1" +version = "1.0.1" readme = "README.rst" -requires-python = ">=3.8" -dependencies = [] -optional-dependencies = { yaml = ["PyYaml"], re2 = ["google-re2"] } +requires-python = ">=3.10" +dependencies = ["ua-parser-builtins"] license = {text = "Apache 2.0"} -urls = {repository = "https://github.com/ua-parser/uap-python"} authors = [ { name = "Stephen Lamm", email = "slamm@google.com"}, @@ -26,7 +24,7 @@ maintainers = [ ] classifiers = [ - "Development Status :: 4 - Beta", + "Development Status :: 5 - Production/Stable", "Environment :: Web Environment", "Intended Audience :: Developers", "Operating System :: OS Independent", @@ -35,12 +33,35 @@ classifiers = [ "Topic :: Internet :: WWW/HTTP", "Topic :: Software Development :: Libraries :: Python Modules", "Programming Language :: Python", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", "Programming Language :: Python :: Implementation :: CPython", - "Programming Language :: Python :: Implementation :: PyPy" + "Programming Language :: Python :: Implementation :: PyPy", + "Programming Language :: Python :: Implementation :: GraalPy", +] + +[project.urls] +documentation = "https://uap-python.readthedocs.io" +repository = "https://github.com/ua-parser/uap-python" +issues = "https://github.com/ua-parser/uap-python/issues" + +[project.optional-dependencies] +yaml = ["PyYaml"] +re2 = ["google-re2"] +regex = ["ua-parser-rs"] + 
+[tool.setuptools.packages.find] +where = ["src"] + +[tool.setuptools.package-data] +"ua_parser" = ["py.typed"] + +[tool.ruff] +exclude = [ + "src/ua_parser/generate_builtins.py", ] [tool.ruff.lint] @@ -56,8 +77,11 @@ classes = ["OS"] known-first-party = ["ua_parser"] combine-as-imports = true +[tool.ruff.lint.per-file-ignores] +"tests/test_legacy.py" = ["PT030"] + [tool.mypy] -python_version = "3.8" +python_version = "3.10" files = "src,tests" # can't use strict because it's only global @@ -68,7 +92,7 @@ warn_redundant_casts = true # these can be overridden (maybe?) strict_equality = true -strict_concatenate = true +# strict_concatenate = true check_untyped_defs = true disallow_subclassing_any = true disallow_untyped_decorators = true @@ -92,6 +116,7 @@ module = [ "test_core", "test_caches", "test_parsers_basics", + "test_fa_simplifier", ] #check_untyped_defs = false diff --git a/scripts/relrev.py b/scripts/relrev.py new file mode 100644 index 00000000..0979a407 --- /dev/null +++ b/scripts/relrev.py @@ -0,0 +1,67 @@ +import argparse +import contextlib +import hashlib +import json +import re +import shutil +import sys +import tempfile +import zipfile +from urllib import parse, request + +parser = argparse.ArgumentParser( + description="Retrieves the revision for the latest release of ua-parser-builtins", +) +parser.add_argument( + "--domain", + default="pypi.org", +) +args = parser.parse_args() + +url = parse.urlunsplit(("https", args.domain, "simple/ua-parser-builtins", "", "")) + +print("checking", url, file=sys.stderr) +res = request.urlopen( + request.Request( + url, + headers={ + "Accept": "application/vnd.pypi.simple.v1+json", + }, + ) +) +if res.status != 200: + exit(f"Failed to retrieve project distributions: {res.status}") + +distributions = json.load(res) +version, distribution = next( + (v, d) + for v, d in zip( + reversed(distributions["versions"]), reversed(distributions["files"]) + ) + if not d["yanked"] + if re.fullmatch( + 
r"(\d+!)?\d+(\.\d+)*(\.post\d+)?", + v, + flags=re.ASCII, + ) +) +print("latest version:", version, file=sys.stderr) + +res = request.urlopen(distribution["url"]) +if res.status != 200: + exit(f"Failed to retrieve wheel: {res.status}") + +with tempfile.SpooledTemporaryFile(256 * 1024) as tf: + shutil.copyfileobj(res, tf) + for name, val in distribution["hashes"].items(): + tf.seek(0) + d = hashlib.file_digest(tf, name).hexdigest() + if d != val: + exit(f"{name} mismatch: expected {val!r} got {d!r}") + tf.seek(0) + with zipfile.ZipFile(tf) as z: + # if the REVISION file is not found then it's fine it's a + # pre-calver release (hopefully) and that means we should cut + # a calver one + with contextlib.suppress(KeyError): + print(z.read("REVISION").decode()) diff --git a/scripts/tagcore.py b/scripts/tagcore.py new file mode 100644 index 00000000..a5ef7f88 --- /dev/null +++ b/scripts/tagcore.py @@ -0,0 +1,72 @@ +import argparse +import datetime +import pathlib +import shutil +import subprocess + +CORE_REMOTE = "https://github.com/ua-parser/uap-core" + + +parser = argparse.ArgumentParser( + description="""Updates `uap-core` to `ref` and tags it with `version` + +If successful, writes the commit to `REVISION` and prints it to stdout. 
+""" +) +parser.add_argument( + "--ref", + default="HEAD", + help="uap-core ref to build, defaults to HEAD (the head of the default branch)", +) +parser.add_argument( + "--version", + help="version to tag the package as, defaults to an YMD calendar version matching the ref's commit date", +) +args = parser.parse_args() + + +if not shutil.which("git"): + exit("git required") + +r = subprocess.run( + ["git", "ls-remote", CORE_REMOTE, args.ref], + encoding="utf-8", + stdout=subprocess.PIPE, +) +if r.returncode: + exit("Unable to query uap-core repo") + +if r.stdout: + if r.stdout.count("\n") > 1: + exit(f"Found multiple matching refs for {args.ref}:\n{r.stdout}") + commit, _rest = r.stdout.split("\t", 1) +else: + try: + int(args.ref, 16) + commit = args.ref + except ValueError: + exit(f"Unknown or invalid ref {args.ref!r}") + +CORE_PATH = pathlib.Path(__file__).resolve().parent.parent / "uap-core" + +r = subprocess.run(["git", "-C", CORE_PATH, "fetch", CORE_REMOTE, commit]) +if r.returncode: + exit(f"Unable to retrieve commit {commit!r}") + +if args.version: + tagname = args.version +else: + r = subprocess.run( + ["git", "-C", CORE_PATH, "show", "-s", "--format=%cs", commit], + encoding="utf-8", + stdout=subprocess.PIPE, + ) + if r.returncode or not r.stdout: + exit(f"Unable to retrieve commit date from commit {commit!r}") + + tagname = datetime.date.fromisoformat(r.stdout.rstrip()).strftime("%Y%m%d") + +subprocess.run(["git", "-C", CORE_PATH, "switch", "-d", commit]) +subprocess.run(["git", "-C", CORE_PATH, "tag", tagname, commit]) +CORE_PATH.joinpath("REVISION").write_text(commit) +print(commit) diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 9b07aee0..00000000 --- a/setup.cfg +++ /dev/null @@ -1,8 +0,0 @@ -[options] -packages = find: -package_dir = - =src -setup_requires = pyyaml - -[options.packages.find] -where = src diff --git a/setup.py b/setup.py deleted file mode 100644 index c6947780..00000000 --- a/setup.py +++ /dev/null @@ -1,217 +0,0 @@ 
-#!/usr/bin/env python -# flake8: noqa -import io -from contextlib import suppress, contextmanager -from os import fspath -from pathlib import Path -from typing import Optional, List, Dict - -from setuptools import setup, Command, find_namespace_packages -from setuptools.command.build import build, SubCommand -from setuptools.command.editable_wheel import editable_wheel - -import yaml - - -build.sub_commands.insert(0, ("compile-regexes", None)) - - -class CompileRegexes(Command, SubCommand): - def initialize_options(self) -> None: - self.pkg_name: Optional[str] = None - - def finalize_options(self) -> None: - self.pkg_name = self.distribution.get_name().replace("-", "_") - - def get_source_files(self) -> List[str]: - return ["uap-core/regexes.yaml"] - - def get_outputs(self) -> List[str]: - return [f"{self.pkg_name}/_regexes.py"] - - def get_output_mapping(self) -> Dict[str, str]: - return dict(zip(self.get_source_files(), self.get_outputs())) - - def run(self) -> None: - # FIXME: check git / submodules? - """ - work_path = self.work_path - if not os.path.exists(os.path.join(work_path, ".git")): - return - - log.info("initializing git submodules") - check_output(["git", "submodule", "init"], cwd=work_path) - check_output(["git", "submodule", "update"], cwd=work_path) - """ - if not self.pkg_name: - return # or error? 
- - yaml_src = Path("uap-core", "regexes.yaml") - if not yaml_src.is_file(): - raise RuntimeError( - f"Unable to find regexes.yaml, should be at {yaml_src!r}" - ) - - with yaml_src.open("rb") as f: - regexes = yaml.safe_load(f) - - if self.editable_mode: - dist_dir = Path("src") - else: - dist_dir = Path(self.get_finalized_command("bdist_wheel").bdist_dir) - - outdir = dist_dir / self.pkg_name - outdir.mkdir(parents=True, exist_ok=True) - - dest = outdir / "_matchers.py" - dest_lazy = outdir / "_lazy.py" - dest_legacy = outdir / "_regexes.py" - - with dest.open("wb") as eager, dest_lazy.open("wb") as lazy, dest_legacy.open( - "wb" - ) as legacy: - eager = EagerWriter(eager) - lazy = LazyWriter(lazy) - legacy = LegacyWriter(legacy) - - for section in ["user_agent_parsers", "os_parsers", "device_parsers"]: - with eager.section(section), lazy.section(section), legacy.section( - section - ): - extract = EXTRACTORS[section] - for p in regexes[section]: - el = trim(extract(p)) - eager.item(el) - lazy.item(el) - legacy.item(el) - eager.end() - lazy.end() - legacy.end() - - -def trim(l): - while len(l) > 1 and l[-1] is None: - l.pop() - return l - - -EXTRACTORS = { - "user_agent_parsers": lambda p: [ - p["regex"], - p.get("family_replacement"), - p.get("v1_replacement"), - p.get("v2_replacement"), - ], - "os_parsers": lambda p: [ - p["regex"], - p.get("os_replacement"), - p.get("os_v1_replacement"), - p.get("os_v2_replacement"), - p.get("os_v3_replacement"), - p.get("os_v4_replacement"), - ], - "device_parsers": lambda p: [ - p["regex"], - p.get("regex_flag"), - p.get("device_replacement"), - p.get("brand_replacement"), - p.get("model_replacement"), - ], -} - - -class Writer: - section_end = b"" - - def __init__(self, fp): - self.fp = fp - self.fp.write( - b"""\ -######################################################## -# NOTICE: this file is autogenerated from regexes.yaml # -######################################################## -""" - ) - self.fp.write(self.prefix) - 
self._section = None - - @contextmanager - def section(self, id): - self._section = id - self.fp.write(self.sections[id]) - yield - self.fp.write(self.section_end) - - def item(self, elements): - # DeviceMatcher(re, flag, repl1), - self.fp.write(self.items[self._section]) - self.fp.write(", ".join(map(repr, elements)).encode()) - self.fp.write(b"),\n") - - def end(self): - self.fp.write(self.suffix) - - -class LegacyWriter(Writer): - prefix = b"""\ -__all__ = [ - "USER_AGENT_PARSERS", - "DEVICE_PARSERS", - "OS_PARSERS", -] - -from .user_agent_parser import UserAgentParser, DeviceParser, OSParser - -""" - sections = { - "user_agent_parsers": b"USER_AGENT_PARSERS = [\n", - "os_parsers": b"\n\nOS_PARSERS = [\n", - "device_parsers": b"\n\nDEVICE_PARSERS = [\n", - } - section_end = b"]" - items = { - "user_agent_parsers": b" UserAgentParser(", - "os_parsers": b" OSParser(", - "device_parsers": b" DeviceParser(", - } - suffix = b"\n" - - -class EagerWriter(Writer): - prefix = b"""\ -__all__ = ["MATCHERS"] - -from typing import Tuple, List -from .matchers import UserAgentMatcher, OSMatcher, DeviceMatcher - -MATCHERS: Tuple[List[UserAgentMatcher], List[OSMatcher], List[DeviceMatcher]] = ([ -""" - sections = { - "user_agent_parsers": b"", - "os_parsers": b"], [\n", - "device_parsers": b"], [\n", - } - items = { - "user_agent_parsers": b" UserAgentMatcher(", - "os_parsers": b" OSMatcher(", - "device_parsers": b" DeviceMatcher(", - } - suffix = b"])\n" - - -class LazyWriter(EagerWriter): - prefix = b"""\ -__all__ = ["MATCHERS"] - -from typing import Tuple, List -from .lazy import UserAgentMatcher, OSMatcher, DeviceMatcher - -MATCHERS: Tuple[List[UserAgentMatcher], List[OSMatcher], List[DeviceMatcher]] = ([ -""" - - -setup( - cmdclass={ - "compile-regexes": CompileRegexes, - } -) diff --git a/src/ua_parser/__init__.py b/src/ua_parser/__init__.py index a9a09b47..5b5ba71b 100644 --- a/src/ua_parser/__init__.py +++ b/src/ua_parser/__init__.py @@ -20,17 +20,17 @@ from __future__ 
import annotations __all__ = [ + "OS", "BasicResolver", - "CachingResolver", "Cache", + "CachingResolver", "DefaultedResult", "Device", "Domain", "Matchers", - "OS", - "Result", - "Resolver", "PartialResult", + "Resolver", + "Result", "UserAgent", "load_builtins", "load_lazy_builtins", @@ -41,7 +41,8 @@ ] import importlib.util -from typing import Callable, Optional +import threading +from typing import Callable, Optional, cast from .basic import Resolver as BasicResolver from .caching import CachingResolver, S3Fifo as Cache @@ -57,13 +58,28 @@ UserAgent, ) from .loaders import load_builtins, load_lazy_builtins +from .utils import IS_GRAAL -Re2Resolver: Optional[Callable[[Matchers], Resolver]] = None +_ResolverCtor = Callable[[Matchers], Resolver] +Re2Resolver: Optional[_ResolverCtor] = None if importlib.util.find_spec("re2"): from .re2 import Resolver as Re2Resolver +RegexResolver: Optional[_ResolverCtor] = None +if importlib.util.find_spec("ua_parser_rs"): + from .regex import Resolver as RegexResolver +BestAvailableResolver: _ResolverCtor = next( + filter( + None, + ( + RegexResolver, + Re2Resolver, + lambda m: CachingResolver(BasicResolver(m), Cache(2000)), + ), + ) +) -VERSION = (1, 0, 0) +VERSION = (1, 0, 1) class Parser: @@ -81,15 +97,7 @@ def from_matchers(cls, m: Matchers, /) -> Parser: stack. """ - if Re2Resolver is not None: - return cls(Re2Resolver(m)) - else: - return cls( - CachingResolver( - BasicResolver(m), - Cache(200), - ) - ) + return cls(BestAvailableResolver(m)) def __init__(self, resolver: Resolver) -> None: self.resolver = resolver @@ -128,14 +136,27 @@ def parse_device(self: Resolver, ua: str) -> Optional[Device]: initialisation, rather than pay for it at first call. 
""" +_lazy_globals_lock = threading.Lock() + def __getattr__(name: str) -> Parser: global parser - if name == "parser": - parser = Parser.from_matchers( - load_builtins() if Re2Resolver is None else load_lazy_builtins() - ) - return parser + with _lazy_globals_lock: + if name == "parser": + # if two threads access `ua_parser.parser` before it's + # initialised, the second one will wait until the first + # one's finished by which time the parser global should be + # set and can be returned with no extra work + if p := globals().get("parser"): + return cast(Parser, p) + + if RegexResolver or Re2Resolver or IS_GRAAL: + matchers = load_lazy_builtins() + else: + matchers = load_builtins() + parser = Parser.from_matchers(matchers) + return parser + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/src/ua_parser/__main__.py b/src/ua_parser/__main__.py index d4ff29b2..047efaaa 100644 --- a/src/ua_parser/__main__.py +++ b/src/ua_parser/__main__.py @@ -11,7 +11,7 @@ import sys import threading import time -import tracemalloc +import types from typing import ( Any, Callable, @@ -38,12 +38,21 @@ ) from .caching import Cache, Local from .loaders import load_builtins, load_yaml -from .re2 import Resolver as Re2Resolver + +try: + from .re2 import Resolver as Re2Resolver +except ImportError: + pass +try: + from .regex import Resolver as RegexResolver +except ImportError: + pass from .user_agent_parser import Parse CACHEABLE = { "basic": True, "re2": True, + "regex": True, "legacy": False, } @@ -58,6 +67,17 @@ ] ) +try: + import tracemalloc +except ImportError: + snapshot = types.SimpleNamespace( + compare_to=lambda _1, _2: [], + ) + tracemalloc = types.SimpleNamespace( # type: ignore + start=lambda: None, + take_snapshot=lambda: snapshot, + ) + def get_rules(parsers: List[str], regexes: Optional[io.IOBase]) -> Matchers: if regexes: @@ -81,7 +101,7 @@ def run_stdout(args: argparse.Namespace) -> None: lines = list(args.file) count = len(lines) 
uniques = len(set(lines)) - print(f"{args.file.name}: {count} lines, {uniques} unique ({uniques/count:.0%})") + print(f"{args.file.name}: {count} lines, {uniques} unique ({uniques / count:.0%})") rules = get_rules(args.bases, args.regexes) @@ -178,6 +198,8 @@ def get_parser( r = BasicResolver(rules) elif parser == "re2": r = Re2Resolver(rules) + elif parser == "regex": + r = RegexResolver(rules) else: sys.exit(f"unknown parser {parser!r}") @@ -298,7 +320,7 @@ def belady(maxsize: int) -> Cache: overhead / cache_size, ) print( - f"{cache.__name__.lower():8}({cache_size:{w}}): {(total - misses.count)/total*100:2.0f}% hit rate {diff}" + f"{cache.__name__.lower():8}({cache_size:{w}}): {(total - misses.count) / total * 100:2.0f}% hit rate {diff}" ) del misses, parser @@ -327,6 +349,7 @@ def run_threaded(args: argparse.Namespace) -> None: ("locking-lru", CachingResolver(basic, caching.Lru(CACHESIZE))), ("local-lru", CachingResolver(basic, Local(lambda: caching.Lru(CACHESIZE)))), ("re2", Re2Resolver(load_builtins())), + ("regex", RegexResolver(load_builtins())), ] for name, resolver in resolvers: print(f"{name:11}: ", end="", flush=True) @@ -355,7 +378,7 @@ def run_threaded(args: argparse.Namespace) -> None: totlines = len(lines) * args.threads # runtime in us t = (time.perf_counter_ns() - st) / 1000 - print(f"{t/totlines:>4.0f}us/line", flush=True) + print(f"{t / totlines:>4.0f}us/line", flush=True) EPILOG = """For good results the sample `file` should be an actual @@ -436,14 +459,14 @@ def __call__( bench.add_argument( "--bases", nargs="+", - choices=["basic", "re2", "legacy"], - default=["basic", "re2", "legacy"], + choices=["basic", "re2", "regex", "legacy"], + default=["basic", "re2", "regex", "legacy"], help="""Base resolvers to benchmark. 
`basic` is a linear search through the regexes file, `re2` is a prefiltered regex set - implemented in C++, `legacy` is the legacy API (essentially a - basic resolver with a clearing cache of fixed 200 entries, but - less layered so usually slightly faster than an equivalent - basic-based resolver).""", + implemented in C++, `regex` is a prefiltered regex set implemented + in Rust, `legacy` is the legacy API (essentially a basic resolver + with a clearing cache of fixed 200 entries, but less layered so + usually slightly faster than an equivalent basic-based resolver).""", ) bench.add_argument( "--caches", diff --git a/src/ua_parser/_lazy.pyi b/src/ua_parser/_lazy.pyi deleted file mode 100644 index 741db1af..00000000 --- a/src/ua_parser/_lazy.pyi +++ /dev/null @@ -1,11 +0,0 @@ -__all__ = ["MATCHERS"] - -from typing import List, Tuple - -from .lazy import DeviceMatcher, OSMatcher, UserAgentMatcher - -MATCHERS: Tuple[ - List[UserAgentMatcher], - List[OSMatcher], - List[DeviceMatcher], -] diff --git a/src/ua_parser/_matchers.pyi b/src/ua_parser/_matchers.pyi deleted file mode 100644 index 2269fb43..00000000 --- a/src/ua_parser/_matchers.pyi +++ /dev/null @@ -1,11 +0,0 @@ -__all__ = ["MATCHERS"] - -from typing import List, Tuple - -from .matchers import DeviceMatcher, OSMatcher, UserAgentMatcher - -MATCHERS: Tuple[ - List[UserAgentMatcher], - List[OSMatcher], - List[DeviceMatcher], -] diff --git a/src/ua_parser/_regexes.pyi b/src/ua_parser/_regexes.pyi deleted file mode 100644 index 10bc2ef4..00000000 --- a/src/ua_parser/_regexes.pyi +++ /dev/null @@ -1,7 +0,0 @@ -from typing import List - -from .user_agent_parser import DeviceParser, OSParser, UserAgentParser - -USER_AGENT_PARSERS: List[UserAgentParser] -OS_PARSERS: List[OSParser] -DEVICE_PARSERS: List[DeviceParser] diff --git a/src/ua_parser/basic.py b/src/ua_parser/basic.py index bdc1e69e..00b49e15 100644 --- a/src/ua_parser/basic.py +++ b/src/ua_parser/basic.py @@ -1,7 +1,9 @@ __all__ = ["Resolver"] +import re 
+from itertools import chain from operator import methodcaller -from typing import List +from typing import Any, List from .core import ( Device, @@ -12,6 +14,7 @@ PartialResult, UserAgent, ) +from .utils import IS_GRAAL, fa_simplifier class Resolver: @@ -30,6 +33,24 @@ def __init__( matchers: Matchers, ) -> None: self.user_agent_matchers, self.os_matchers, self.device_matchers = matchers + if IS_GRAAL: + matcher: Any + kind = next( + ( + "eager" if hasattr(type(m), "regex") else "lazy" + for m in chain.from_iterable(matchers) + ), + None, + ) + if kind == "eager": + for matcher in chain.from_iterable(matchers): + matcher.pattern = re.compile( + fa_simplifier(matcher.pattern.pattern), + flags=matcher.pattern.flags, + ) + elif kind == "lazy": + for matcher in chain.from_iterable(matchers): + matcher.regex = fa_simplifier(matcher.pattern.pattern) def __call__(self, ua: str, domains: Domain, /) -> PartialResult: parse = methodcaller("__call__", ua) diff --git a/src/ua_parser/caching.py b/src/ua_parser/caching.py index 706ad4b3..998c4b36 100644 --- a/src/ua_parser/caching.py +++ b/src/ua_parser/caching.py @@ -78,7 +78,7 @@ def __setitem__(self, key: str, value: PartialResult) -> None: @dataclasses.dataclass class CacheEntry: - __slots__ = ["key", "value", "freq"] + __slots__ = ["freq", "key", "value"] key: str value: PartialResult freq: int @@ -161,7 +161,7 @@ def _evict_small(self) -> None: @dataclasses.dataclass class SieveNode: - __slots__ = ("key", "value", "visited", "next") + __slots__ = ("key", "next", "value", "visited") key: str value: PartialResult visited: bool diff --git a/src/ua_parser/core.py b/src/ua_parser/core.py index 8ea880d6..b7133d4c 100644 --- a/src/ua_parser/core.py +++ b/src/ua_parser/core.py @@ -4,14 +4,14 @@ from typing import Generic, List, Optional, Protocol, Tuple, TypeVar __all__ = [ + "OS", "DefaultedResult", "Device", "Domain", "Matchers", - "OS", - "Result", "PartialResult", "Resolver", + "Result", "UserAgent", ] @@ -74,7 +74,7 @@ def 
__init__( class Device: """Device information parsed from the user agent string.""" - __slots__ = ("family", "brand", "model") + __slots__ = ("brand", "family", "model") family: str brand: Optional[str] model: Optional[str] @@ -172,7 +172,7 @@ class PartialResult: """ - __slots__ = ("domains", "user_agent", "os", "device", "string") + __slots__ = ("device", "domains", "os", "string", "user_agent") domains: Domain user_agent: Optional[UserAgent] os: Optional[OS] diff --git a/src/ua_parser/lazy.py b/src/ua_parser/lazy.py index c5aa5e23..4f0abedf 100644 --- a/src/ua_parser/lazy.py +++ b/src/ua_parser/lazy.py @@ -1,4 +1,4 @@ -__all__ = ["UserAgentMatcher", "OSMatcher", "DeviceMatcher"] +__all__ = ["DeviceMatcher", "OSMatcher", "UserAgentMatcher"] import re from functools import cached_property diff --git a/src/ua_parser/loaders.py b/src/ua_parser/loaders.py index 18fc3d25..55774eaf 100644 --- a/src/ua_parser/loaders.py +++ b/src/ua_parser/loaders.py @@ -52,7 +52,7 @@ def load_builtins() -> Matchers: further imports simply reference the existing datas. """ - from ._matchers import MATCHERS + from ua_parser_builtins.matchers import MATCHERS # typing and mypy don't have safe upcast (#5756) and mypy is # unhappy about returning concrete matchers for a mixed type @@ -66,7 +66,7 @@ def load_lazy_builtins() -> Matchers: further imports simply reference the existing datas. 
""" - from ._lazy import MATCHERS + from ua_parser_builtins.lazy import MATCHERS return cast(Matchers, MATCHERS) diff --git a/src/ua_parser/matchers.py b/src/ua_parser/matchers.py index 3956b3b5..35200b0b 100644 --- a/src/ua_parser/matchers.py +++ b/src/ua_parser/matchers.py @@ -1,4 +1,4 @@ -__all__ = ["UserAgentMatcher", "OSMatcher", "DeviceMatcher"] +__all__ = ["DeviceMatcher", "OSMatcher", "UserAgentMatcher"] import re from typing import Literal, Optional, Pattern diff --git a/src/ua_parser/py.typed b/src/ua_parser/py.typed new file mode 100644 index 00000000..e69de29b diff --git a/src/ua_parser/re2.py b/src/ua_parser/re2.py index 83a4a148..1f17e225 100644 --- a/src/ua_parser/re2.py +++ b/src/ua_parser/re2.py @@ -14,6 +14,7 @@ PartialResult, UserAgent, ) +from .utils import fa_simplifier class DummyFilter: @@ -38,7 +39,7 @@ def __init__( if self.user_agent_matchers: self.ua = re2.Filter() for u in self.user_agent_matchers: - self.ua.Add(u.regex) + self.ua.Add(fa_simplifier(u.regex)) self.ua.Compile() else: self.ua = DummyFilter() @@ -46,7 +47,7 @@ def __init__( if self.os_matchers: self.os = re2.Filter() for o in self.os_matchers: - self.os.Add(o.regex) + self.os.Add(fa_simplifier(o.regex)) self.os.Compile() else: self.os = DummyFilter() @@ -58,9 +59,9 @@ def __init__( # no pattern uses global flags, but since they're not # supported in JS that seems safe. 
if d.flags & re.IGNORECASE: - self.devices.Add("(?i)" + d.regex) + self.devices.Add("(?i)" + fa_simplifier(d.regex)) else: - self.devices.Add(d.regex) + self.devices.Add(fa_simplifier(d.regex)) self.devices.Compile() else: self.devices = DummyFilter() diff --git a/src/ua_parser/regex.py b/src/ua_parser/regex.py new file mode 100644 index 00000000..704df160 --- /dev/null +++ b/src/ua_parser/regex.py @@ -0,0 +1,76 @@ +__all__ = ["Resolver"] + +from operator import attrgetter + +import ua_parser_rs # type: ignore + +from .core import ( + Device, + Domain, + Matchers, + OS, + PartialResult, + UserAgent, +) + + +class Resolver: + ua: ua_parser_rs.UserAgentExtractor + os: ua_parser_rs.OSExtractor + de: ua_parser_rs.DeviceExtractor + + def __init__(self, matchers: Matchers) -> None: + ua, os, de = matchers + self.ua = ua_parser_rs.UserAgentExtractor( + map( + attrgetter("regex", "family", "major", "minor", "patch", "patch_minor"), + ua, + ) + ) + self.os = ua_parser_rs.OSExtractor( + map( + attrgetter("regex", "family", "major", "minor", "patch", "patch_minor"), + os, + ) + ) + self.de = ua_parser_rs.DeviceExtractor( + map( + attrgetter("regex", "regex_flag", "family", "brand", "model"), + de, + ) + ) + + def __call__(self, ua: str, domains: Domain, /) -> PartialResult: + user_agent = os = device = None + if Domain.USER_AGENT in domains: + if m := self.ua.extract(ua): + user_agent = UserAgent( + m.family, + m.major, + m.minor, + m.patch, + m.patch_minor, + ) + if Domain.OS in domains: + if m := self.os.extract(ua): + os = OS( + m.family, + m.major, + m.minor, + m.patch, + m.patch_minor, + ) + if Domain.DEVICE in domains: + if m := self.de.extract(ua): + device = Device( + m.family, + m.brand, + m.model, + ) + return PartialResult( + domains=domains, + string=ua, + user_agent=user_agent, + os=os, + device=device, + ) diff --git a/src/ua_parser/user_agent_parser.py b/src/ua_parser/user_agent_parser.py index 5cb1c744..e6e4bb3e 100644 --- a/src/ua_parser/user_agent_parser.py 
+++ b/src/ua_parser/user_agent_parser.py @@ -521,4 +521,8 @@ def GetFilters( del SafeLoader else: # Just load our pre-compiled versions - from ._regexes import DEVICE_PARSERS, OS_PARSERS, USER_AGENT_PARSERS + from ua_parser_builtins.regexes import ( + DEVICE_PARSERS, + OS_PARSERS, + USER_AGENT_PARSERS, + ) diff --git a/src/ua_parser/utils.py b/src/ua_parser/utils.py index f3afa486..ac11c5a0 100644 --- a/src/ua_parser/utils.py +++ b/src/ua_parser/utils.py @@ -1,6 +1,9 @@ +import platform import re from typing import Match, Optional +IS_GRAAL: bool = platform.python_implementation() == "GraalVM" + def get(m: Match[str], idx: int) -> Optional[str]: return (m[idx] or None) if 0 < idx <= m.re.groups else None @@ -28,3 +31,33 @@ def replacer(repl: str, m: Match[str]) -> Optional[str]: return None return re.sub(r"\$(\d)", lambda n: get(m, int(n[1])) or "", repl).strip() or None + + +REPETITION_PATTERN = re.compile(r"\{(0|1)\s*,\s*\d{3,}\}") +CLASS_PATTERN = re.compile( + r""" +\[[^]]*\\(d|w)[^]]*\] +| +\\(d|w) +""", + re.VERBOSE, +) + + +def class_replacer(m: re.Match[str]) -> str: + d, w = ("0-9", "A-Za-z0-9_") if m[1] else ("[0-9]", "[A-Za-z0-9_]") + return m[0].replace(r"\d", d).replace(r"\w", w) + + +def fa_simplifier(pattern: str) -> str: + """uap-core makes significant use of large bounded repetitions, to + mitigate catastrophic backtracking. + + However this explodes the number of states (and thus graph size) + for finite automaton engines, which significantly increases their + memory use, and for those which use JITs it can exceed the JIT + threshold and force fallback to a slower engine (seems to be the + case for graal's TRegex). 
+ """ + pattern = REPETITION_PATTERN.sub(lambda m: "*" if m[1] == "0" else "+", pattern) + return CLASS_PATTERN.sub(class_replacer, pattern) diff --git a/tests/test_caches.py b/tests/test_caches.py index d612520a..c4ff990b 100644 --- a/tests/test_caches.py +++ b/tests/test_caches.py @@ -1,17 +1,14 @@ from collections import OrderedDict +import pytest # type: ignore + from ua_parser import ( - BasicResolver, CachingResolver, - Device, Domain, - OS, Parser, PartialResult, - UserAgent, ) -from ua_parser.caching import Lru -from ua_parser.matchers import DeviceMatcher, OSMatcher, UserAgentMatcher +from ua_parser.caching import Lru, S3Fifo, Sieve def test_lru(): @@ -19,7 +16,9 @@ def test_lru(): popped LRU-first. """ cache = Lru(2) - p = Parser(CachingResolver(BasicResolver(([], [], [])), cache)) + p = Parser( + CachingResolver(lambda s, d: PartialResult(d, None, None, None, s), cache) + ) p.parse("a") p.parse("b") @@ -41,37 +40,36 @@ def test_lru(): ) -def test_backfill(): - """Tests that caches handle partial parsing correctly, by updating the - existing entry when new parts get parsed. +@pytest.mark.parametrize("cache", [Lru, S3Fifo, Sieve]) +def test_backfill(cache): + """Tests that caches handle partial parsing correctly, by updating + the existing entry when new parts get parsed, without evicting + still-fitting entries. """ - cache = Lru(2) - p = Parser( - CachingResolver( - BasicResolver( - ( - [UserAgentMatcher("(a)")], - [OSMatcher("(a)")], - [DeviceMatcher("(a)")], - ) - ), - cache, - ) - ) + misses = 0 + + def resolver(ua: str, domains: Domain, /) -> PartialResult: + nonlocal misses + misses += 1 + return PartialResult(domains, None, None, None, ua) + + p = Parser(CachingResolver(resolver, cache(10))) + # fill the cache first, no need to hit the entries twice because + # S3 waits until it needs space in the main cache before demotes + # (or promotes) from the probationary cache. 
+ for s in map(str, range(9)): + p.parse(s) + assert misses == 9 + # add a partial entry p.parse_user_agent("a") - assert cache.cache == { - "a": PartialResult(Domain.USER_AGENT, UserAgent("a"), None, None, "a"), - } - p("a", Domain.OS) - assert cache.cache == { - "a": PartialResult( - Domain.USER_AGENT | Domain.OS, UserAgent("a"), OS("a"), None, "a" - ), - } - p.parse("a") - assert cache.cache == { - "a": PartialResult( - Domain.ALL, UserAgent("a"), OS("a"), Device("a", None, "a"), "a" - ), - } + # fill the partial entry, counts as a miss since it needs to + # resolve the new bit + p.parse_os("a") + assert misses == 11 + + misses = 0 + # check that the original entries are all hits + for s in map(str, range(9)): + p.parse(s) + assert misses == 0 diff --git a/tests/test_convenience_parser.py b/tests/test_convenience_parser.py index cf1d3609..86240613 100644 --- a/tests/test_convenience_parser.py +++ b/tests/test_convenience_parser.py @@ -1,6 +1,23 @@ +import ua_parser from ua_parser import Domain, Parser, PartialResult, Result +def test_parser_memoized() -> None: + """The global parser should be lazily instantiated but memoized""" + # ensure there is no global parser + vars(ua_parser).pop("parser", None) + + p1 = ua_parser.parser + p2 = ua_parser.parser + + assert p1 is p2 + + # force the creation of a clean parser + del ua_parser.parser + p3 = ua_parser.parser + assert p3 is not p1 + + def resolver(s: str, d: Domain) -> PartialResult: return PartialResult(d, None, None, None, s) diff --git a/tests/test_core.py b/tests/test_core.py index 4c801265..1a87702f 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -3,25 +3,22 @@ import dataclasses import logging import pathlib -import platform from operator import attrgetter +from typing import cast import pytest # type: ignore -if platform.python_implementation() == "PyPy": - from yaml import SafeLoader, load -else: - try: - from yaml import ( # type: ignore - CSafeLoader as SafeLoader, - load, - ) - except 
ImportError: - logging.getLogger(__name__).warning( - "PyYaml C extension not available to run tests, this will result " - "in dramatic tests slowdown." - ) - from yaml import SafeLoader, load +try: + from yaml import ( + CSafeLoader as SafeLoader, + load, + ) +except ImportError: + logging.getLogger(__name__).warning( + "PyYaml C extension not available to run tests, this will result " + "in tests slowdown." + ) + from yaml import SafeLoader, load # type: ignore from ua_parser import ( BasicResolver, @@ -32,15 +29,22 @@ UserAgent, load_builtins, load_lazy_builtins, + loaders, ) from ua_parser.matchers import UserAgentMatcher CORE_DIR = (pathlib.Path(__name__).parent.parent / "uap-core").resolve() +data = cast(loaders.FileLoader, loaders.load_yaml)(CORE_DIR / "regexes.yaml") +data_lazy = cast(loaders.FileLoader, loaders.load_yaml)( + CORE_DIR / "regexes.yaml", loader=loaders.load_lazy +) PARSERS = [ pytest.param(Parser(BasicResolver(load_builtins())), id="basic"), pytest.param(Parser(BasicResolver(load_lazy_builtins())), id="lazy"), + pytest.param(Parser(BasicResolver(data)), id="basic-yaml"), + pytest.param(Parser(BasicResolver(data_lazy)), id="lazy-yaml"), ] try: from ua_parser import re2 @@ -51,7 +55,20 @@ ) ) else: - PARSERS.append(pytest.param(Parser(re2.Resolver(load_builtins())), id="re2")) + PARSERS.append(pytest.param(Parser(re2.Resolver(data)), id="re2")) + +try: + from ua_parser import regex +except ImportError: + PARSERS.append( + pytest.param( + None, + id="regex", + marks=pytest.mark.skip(reason="regex parser not available"), + ) + ) +else: + PARSERS.append(pytest.param(Parser(regex.Resolver(data)), id="regex")) UA_FIELDS = {f.name for f in dataclasses.fields(UserAgent)} @@ -64,7 +81,7 @@ CORE_DIR / "test_resources" / "firefox_user_agent_strings.yaml", CORE_DIR / "test_resources" / "pgts_browser_list.yaml", ], - ids=attrgetter("name"), + ids=attrgetter("stem"), ) def test_ua(parser, test_file): with test_file.open("rb") as f: @@ -90,7 +107,7 @@ def 
test_ua(parser, test_file): CORE_DIR / "tests" / "test_os.yaml", CORE_DIR / "test_resources" / "additional_os_tests.yaml", ], - ids=attrgetter("name"), + ids=attrgetter("stem"), ) def test_os(parser, test_file): with test_file.open("rb") as f: @@ -111,7 +128,7 @@ def test_os(parser, test_file): [ CORE_DIR / "tests" / "test_device.yaml", ], - ids=attrgetter("name"), + ids=attrgetter("stem"), ) def test_devices(parser, test_file): with test_file.open("rb") as f: diff --git a/tests/test_fa_simplifier.py b/tests/test_fa_simplifier.py new file mode 100644 index 00000000..1c660509 --- /dev/null +++ b/tests/test_fa_simplifier.py @@ -0,0 +1,15 @@ +import pytest # type: ignore + +from ua_parser.utils import fa_simplifier + + +@pytest.mark.parametrize( + ("from_", "to"), + [ + (r"\d", "[0-9]"), + (r"[\d]", "[0-9]"), + (r"[\d\.]", r"[0-9\.]"), + ], +) +def test_classes(from_, to): + assert fa_simplifier(from_) == to diff --git a/tests/test_legacy.py b/tests/test_legacy.py index 7ada17c5..8fafbee6 100644 --- a/tests/test_legacy.py +++ b/tests/test_legacy.py @@ -107,18 +107,18 @@ def runUserAgentTestsFromYAML(self, file_name): result = {} result = user_agent_parser.ParseUserAgent(user_agent_string) - assert ( - result == expected - ), "UA: {0}\n expected<{1}, {2}, {3}, {4}> != actual<{5}, {6}, {7}, {8}>".format( - user_agent_string, - expected["family"], - expected["major"], - expected["minor"], - expected["patch"], - result["family"], - result["major"], - result["minor"], - result["patch"], + assert result == expected, ( + "UA: {0}\n expected<{1}, {2}, {3}, {4}> != actual<{5}, {6}, {7}, {8}>".format( + user_agent_string, + expected["family"], + expected["major"], + expected["minor"], + expected["patch"], + result["family"], + result["major"], + result["minor"], + result["patch"], + ) ) assert ( len(user_agent_parser._PARSE_CACHE) <= user_agent_parser.MAX_CACHE_SIZE @@ -143,20 +143,20 @@ def runOSTestsFromYAML(self, file_name): } result = 
user_agent_parser.ParseOS(user_agent_string) - assert ( - result == expected - ), "UA: {0}\n expected<{1} {2} {3} {4} {5}> != actual<{6} {7} {8} {9} {10}>".format( - user_agent_string, - expected["family"], - expected["major"], - expected["minor"], - expected["patch"], - expected["patch_minor"], - result["family"], - result["major"], - result["minor"], - result["patch"], - result["patch_minor"], + assert result == expected, ( + "UA: {0}\n expected<{1} {2} {3} {4} {5}> != actual<{6} {7} {8} {9} {10}>".format( + user_agent_string, + expected["family"], + expected["major"], + expected["minor"], + expected["patch"], + expected["patch_minor"], + result["family"], + result["major"], + result["minor"], + result["patch"], + result["patch_minor"], + ) ) def runDeviceTestsFromYAML(self, file_name): @@ -176,16 +176,16 @@ def runDeviceTestsFromYAML(self, file_name): } result = user_agent_parser.ParseDevice(user_agent_string) - assert ( - result == expected - ), "UA: {0}\n expected<{1} {2} {3}> != actual<{4} {5} {6}>".format( - user_agent_string, - expected["family"], - expected["brand"], - expected["model"], - result["family"], - result["brand"], - result["model"], + assert result == expected, ( + "UA: {0}\n expected<{1} {2} {3}> != actual<{4} {5} {6}>".format( + user_agent_string, + expected["family"], + expected["brand"], + expected["model"], + result["family"], + result["brand"], + result["model"], + ) ) diff --git a/tox.ini b/tox.ini index bb4af081..63cddcd9 100644 --- a/tox.ini +++ b/tox.ini @@ -1,15 +1,12 @@ [tox] min_version = 4.0 -env_list = py3{8,9,10,11,12} - pypy3.{8,9,10} - flake8, black, typecheck -labels = - test = py3{8,9,10,11,12},pypy3.{8,9,10} - cpy = py3{8,9,10,11,12} - pypy = pypy3.{8,9,10} - check = flake8, black, typecheck +env_list = py3{10,11,12,13,14,14t} + pypy{311} + graalpy3{11,12} + check, format, typecheck [testenv] +labels = test # wheel install package = wheel # wheel is universal so can use the same wheel for all envs @@ -19,28 +16,45 @@ 
wheel_build_env = .pkg deps = pytest pyyaml - google-re2 + ua-parser-rs + ./ua-parser-builtins commands = pytest -Werror --doctest-glob="*.rst" {posargs} -[testenv:pypy3.{8,9,10}] +[testenv:py3{10,11,12,13,14}] +labels = test, cpy deps = pytest pyyaml + google-re2 + ua-parser-rs + ./ua-parser-builtins -[testenv:flake8] +[testenv:check] +labels = check package = skip deps = ruff commands = ruff check {posargs} -[testenv:black] +[testenv:format] +description = Runs the formatter (just showing errors by default) +labels = check package = skip deps = ruff commands = ruff format {posargs:--diff} [testenv:typecheck] +labels = check package = skip deps = mypy types-PyYaml -commands = mypy {posargs:} + ./ua-parser-builtins +commands = mypy {posargs} + +[testenv:docs] +description = Builds the documentation +labels = +package = skip +deps = sphinx +commands = sphinx-build -M {posargs:html} doc docs/_build diff --git a/ua-parser-builtins/README.md b/ua-parser-builtins/README.md new file mode 100644 index 00000000..061f1c76 --- /dev/null +++ b/ua-parser-builtins/README.md @@ -0,0 +1,14 @@ +# Precompiled ruleset for [ua-parser] + +This project does not do anything on its own, nor does it have any +actual API: it contains the dataset of [uap-core] pre-compiled for use +by [ua-parser] to decrease +initialisation times. + +The precompiled ruleset is released monthly based on whatever +[uap-core]'s default branch is at that moment. The [uap-core] commit +used for creating the compiled ruleset is stored in the `REVISION` +file at the root of the wheel. 
+ +[ua-parser]: https://pypi.org/project/ua-parser/ +[uap-core]: https://github.com/ua-parser/uap-core diff --git a/ua-parser-builtins/hatch_build.py b/ua-parser-builtins/hatch_build.py new file mode 100644 index 00000000..2a86012e --- /dev/null +++ b/ua-parser-builtins/hatch_build.py @@ -0,0 +1,219 @@ +from __future__ import annotations + +import io +import json +import os +import os.path +import tempfile +from contextlib import contextmanager +from typing import Any, Callable, ClassVar, Iterator, cast + +import yaml +from hatchling.builders.hooks.plugin.interface import BuildHookInterface +from hatchling.metadata.plugin.interface import MetadataHookInterface +from versioningit import errors, get_version + + +class MetadataHook(MetadataHookInterface): + def update(self, metadata: dict[str, Any]) -> None: + try: + v = get_version( + os.path.join(self.root, "uap-core"), + config={ + "format": { + "distance": "{next_version}.dev{distance}", + } + }, + ) + except errors.NotSdistError: + with open(os.path.join(self.root, "uap-core", "package.json")) as ufile: + ujson = json.load(ufile) + v = ujson["version"] + if v in ("0.15.0", "0.16.0", "0.18.0"): + v = f"{v}.post1" + + metadata["version"] = v + + +class CompilerHook(BuildHookInterface): + def initialize( + self, + version: str, + build_data: dict[str, Any], + ) -> None: + rev = os.path.join(self.root, "uap-core/REVISION") + if os.path.exists(rev): + build_data["force_include"][rev] = "REVISION" + + with open(os.path.join(self.root, "uap-core/regexes.yaml"), "rb") as f: + data = yaml.safe_load(f) + + with ( + tempfile.NamedTemporaryFile(delete=False) as matchers, + tempfile.NamedTemporaryFile(delete=False) as lazy, + tempfile.NamedTemporaryFile(delete=False) as regexes, + ): + matchers_w = EagerWriter(cast(io.RawIOBase, matchers)) + lazy_w = LazyWriter(cast(io.RawIOBase, lazy)) + legacy_w = LegacyWriter(cast(io.RawIOBase, regexes)) + + for section, specs in data.items(): + with ( + matchers_w.section(section), + 
lazy_w.section(section), + legacy_w.section(section), + ): + extract = EXTRACTORS[section] + for s in specs: + el = trim(extract(s)) + matchers_w.item(el) + lazy_w.item(el) + legacy_w.item(el) + + matchers_w.end() + lazy_w.end() + legacy_w.end() + + build_data["force_include"][matchers.name] = "ua_parser_builtins/matchers.py" + build_data["force_include"][lazy.name] = "ua_parser_builtins/lazy.py" + build_data["force_include"][regexes.name] = "ua_parser_builtins/regexes.py" + + def finalize( + self, + version: str, + build_data: dict[str, Any], + artifact_path: str, + ): + tempdir = tempfile.gettempdir() + for k in build_data["force_include"]: + if k.startswith(tempdir): + os.remove(k) + + +def trim(items: list[str | None]) -> list[str | None]: + """Removes trailing `None` from the extraction""" + while len(items) > 1 and items[-1] is None: + items.pop() + return items + + +EXTRACTORS: dict[str, Callable[[dict[str, str]], list[str | None]]] = { + "user_agent_parsers": lambda p: [ + p["regex"], + p.get("family_replacement"), + p.get("v1_replacement"), + p.get("v2_replacement"), + p.get("v3_replacement"), + p.get("v4_replacement"), + ], + "os_parsers": lambda p: [ + p["regex"], + p.get("os_replacement"), + p.get("os_v1_replacement"), + p.get("os_v2_replacement"), + p.get("os_v3_replacement"), + p.get("os_v4_replacement"), + ], + "device_parsers": lambda p: [ + p["regex"], + p.get("regex_flag"), + p.get("device_replacement"), + p.get("brand_replacement"), + p.get("model_replacement"), + ], +} + + +class Writer: + items: ClassVar[dict[str, bytes]] + sections: ClassVar[dict[str, bytes]] + prefix: bytes + suffix = b"" + section_end = b"" + + def __init__(self, fp: io.RawIOBase) -> None: + self.fp = fp + self.fp.write( + b"""\ +######################################################## +# NOTICE: this file is autogenerated from regexes.yaml # +######################################################## +""" + ) + self.fp.write(self.prefix) + self._section: str | None = None + + 
@contextmanager + def section(self, id: str) -> Iterator[None]: + self._section = id + self.fp.write(self.sections[id]) + yield + self.fp.write(self.section_end) + + def item(self, elements: list[str | None]) -> None: + # DeviceMatcher(re, flag, repl1), + # assume we're in a section + self.fp.write(self.items[cast(str, self._section)]) + self.fp.write(", ".join(map(repr, elements)).encode()) + self.fp.write(b"),\n") + + def end(self) -> None: + self.fp.write(self.suffix) + + +class LegacyWriter(Writer): + prefix = b"""\ +__all__ = [ + "USER_AGENT_PARSERS", + "DEVICE_PARSERS", + "OS_PARSERS", +] + +from ua_parser.user_agent_parser import UserAgentParser, DeviceParser, OSParser + +""" + sections: ClassVar[dict[str, bytes]] = { + "user_agent_parsers": b"USER_AGENT_PARSERS = [\n", + "os_parsers": b"\n\nOS_PARSERS = [\n", + "device_parsers": b"\n\nDEVICE_PARSERS = [\n", + } + section_end = b"]" + items: ClassVar[dict[str, bytes]] = { + "user_agent_parsers": b" UserAgentParser(", + "os_parsers": b" OSParser(", + "device_parsers": b" DeviceParser(", + } + suffix = b"\n" + + +class EagerWriter(Writer): + prefix = b"""\ +__all__ = ["MATCHERS"] + +from typing import Tuple, List +from ua_parser.matchers import UserAgentMatcher, OSMatcher, DeviceMatcher + +MATCHERS: Tuple[List[UserAgentMatcher], List[OSMatcher], List[DeviceMatcher]] = ([ +""" + sections: ClassVar[dict[str, bytes]] = { + "user_agent_parsers": b"", + "os_parsers": b"], [\n", + "device_parsers": b"], [\n", + } + items: ClassVar[dict[str, bytes]] = { + "user_agent_parsers": b" UserAgentMatcher(", + "os_parsers": b" OSMatcher(", + "device_parsers": b" DeviceMatcher(", + } + suffix = b"])\n" + + +class LazyWriter(EagerWriter): + prefix = b"""\ +__all__ = ["MATCHERS"] + +from typing import Tuple, List +from ua_parser.lazy import UserAgentMatcher, OSMatcher, DeviceMatcher + +MATCHERS: Tuple[List[UserAgentMatcher], List[OSMatcher], List[DeviceMatcher]] = ([ +""" diff --git a/ua-parser-builtins/pyproject.toml 
b/ua-parser-builtins/pyproject.toml new file mode 100644 index 00000000..6fc800dc --- /dev/null +++ b/ua-parser-builtins/pyproject.toml @@ -0,0 +1,45 @@ +[build-system] +requires = ["hatchling", "versioningit", "pyyaml"] +build-backend = "hatchling.build" + +[project] +name = "ua-parser-builtins" +description = "Precompiled rules for User Agent Parser" +readme = "README.md" +dependencies = [] +requires-python = ">=3.10" +license = {text = "Apache 2.0"} +urls = {repository = "https://github.com/ua-parser/uap-python"} +dynamic = ["version"] +maintainers = [ + { name = "masklinn", email = "uap@masklinn.net" } +] + +classifiers = [ + "Development Status :: 4 - Beta", + "Environment :: Web Environment", + "Intended Audience :: Developers", + "Operating System :: OS Independent", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python", + "Topic :: Internet :: WWW/HTTP", + "Topic :: Software Development :: Libraries :: Python Modules", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", + "Programming Language :: Python :: Implementation :: GraalPy", +] + +[tool.hatch.build.hooks.custom] + +[tool.hatch.metadata.hooks.custom] + +[tool.hatch.build.targets.sdist] +artifacts = [ + "uap-core/regexes.yaml", +] diff --git a/ua-parser-builtins/ua_parser_builtins/__init__.py b/ua-parser-builtins/ua_parser_builtins/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/ua-parser-builtins/ua_parser_builtins/py.typed b/ua-parser-builtins/ua_parser_builtins/py.typed new file mode 100644 index 00000000..e69de29b diff --git a/ua-parser-builtins/uap-core b/ua-parser-builtins/uap-core new file mode 120000 
index 00000000..fbefe368 --- /dev/null +++ b/ua-parser-builtins/uap-core @@ -0,0 +1 @@ +../uap-core \ No newline at end of file