diff --git a/.gitignore b/.gitignore index 1f955509..176d3696 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,4 @@ dist/ tmp/ regexes.yaml _regexes.py +venv \ No newline at end of file diff --git a/.gitmodules b/.gitmodules index fd375d28..14f33488 100644 --- a/.gitmodules +++ b/.gitmodules @@ -2,3 +2,7 @@ path = uap-core url = https://github.com/ua-parser/uap-core branch = master +[submodule "uap-cpp"] + path = uap-cpp + url = https://github.com/ua-parser/uap-cpp + branch = master diff --git a/MANIFEST.in b/MANIFEST.in index 9c004de5..244352c7 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -6,4 +6,6 @@ include README.rst include LICENSE graft uap-core exclude uap-core/.* +exclude uap-cpp/.* recursive-exclude uap-core *.js +recursive-exclude uap-cpp *.cpp diff --git a/README.rst b/README.rst index 05139388..328769bd 100644 --- a/README.rst +++ b/README.rst @@ -1,119 +1,7 @@ -uap-python +uap-python-cpp ========== +I added the use of the uap-cpp library to the base. +Performance has increased by a factor of 10x A python implementation of the UA Parser (https://github.com/ua-parser, formerly https://github.com/tobie/ua-parser) - -Build Status ------------- - -.. image:: https://github.com/ua-parser/uap-python/actions/workflows/ci.yml/badge.svg - :alt: CI on the master branch - - -Installing ----------- - -Install via pip -~~~~~~~~~~~~~~~ - -Just run: - -.. code-block:: sh - - $ pip install ua-parser - -Manual install -~~~~~~~~~~~~~~ - -In the top-level directory run: - -.. code-block:: sh - - $ python setup.py install - -Change Log ---------------- -Because this repo is mostly a python wrapper for the User Agent String Parser repo (https://github.com/ua-parser/uap-core), the changes made to this repo are best described by the update diffs in that project. Please see the diffs for this submodule (https://github.com/ua-parser/uap-core/releases) for a list of what has changed between versions of this package. - -Getting Started ---------------- - -Retrieve data on a user-agent string -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. code-block:: python - - >>> from ua_parser import user_agent_parser - >>> import pprint - >>> pp = pprint.PrettyPrinter(indent=4) - >>> ua_string = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.104 Safari/537.36' - >>> parsed_string = user_agent_parser.Parse(ua_string) - >>> pp.pprint(parsed_string) - { 'device': {'brand': 'Apple', 'family': 'Mac', 'model': 'Mac'}, - 'os': { 'family': 'Mac OS X', - 'major': '10', - 'minor': '9', - 'patch': '4', - 'patch_minor': None}, - 'string': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) ' - 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.104 ' - 'Safari/537.36', - 'user_agent': { 'family': 'Chrome', - 'major': '41', - 'minor': '0', - 'patch': '2272'}} - -Extract browser data from user-agent string -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. code-block:: python - - >>> from ua_parser import user_agent_parser - >>> import pprint - >>> pp = pprint.PrettyPrinter(indent=4) - >>> ua_string = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.104 Safari/537.36' - >>> parsed_string = user_agent_parser.ParseUserAgent(ua_string) - >>> pp.pprint(parsed_string) - {'family': 'Chrome', 'major': '41', 'minor': '0', 'patch': '2272'} - -.. - - ⚠️Before 0.15, the convenience parsers (``ParseUserAgent``, - ``ParseOs``, and ``ParseDevice``) were not cached, which could - result in degraded performances when parsing large amounts of - identical user-agents (which might occur for real-world datasets). - - For these versions (up to 0.10 included), prefer using ``Parse`` - and extracting the sub-component you need from the resulting - dictionary. - -Extract OS information from user-agent string -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. code-block:: python - - >>> from ua_parser import user_agent_parser - >>> import pprint - >>> pp = pprint.PrettyPrinter(indent=4) - >>> ua_string = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.104 Safari/537.36' - >>> parsed_string = user_agent_parser.ParseOS(ua_string) - >>> pp.pprint(parsed_string) - { 'family': 'Mac OS X', - 'major': '10', - 'minor': '9', - 'patch': '4', - 'patch_minor': None} - -Extract Device information from user-agent string -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. code-block:: python - - >>> from ua_parser import user_agent_parser - >>> import pprint - >>> pp = pprint.PrettyPrinter(indent=4) - >>> ua_string = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.104 Safari/537.36' - >>> parsed_string = user_agent_parser.ParseDevice(ua_string) - >>> pp.pprint(parsed_string) - {'brand': 'Apple', 'family': 'Mac', 'model': 'Mac'} diff --git a/pyproject.toml b/pyproject.toml index 48b6aa91..b2511afa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,15 +1,17 @@ [build-system] -requires = ["setuptools", "setuptools-scm", "PyYaml"] +requires = [ + "setuptools", + "pybind11", +] build-backend = "setuptools.build_meta" [project] name = "ua-parser" description = "Python port of Browserscope's user agent parser" -version = "1.0.0a" +version = "0.18.3" readme = "README.rst" -requires-python = ">=3.8" +requires-python = ">=3.10" dependencies = [] -optional-dependencies = { yaml = ["PyYaml"] } license = {text = "Apache 2.0"} urls = {repository = "https://github.com/ua-parser/uap-python"} @@ -26,19 +28,19 @@ maintainers = [ ] classifiers = [ - "Development Status :: 4 - Beta", - "Environment :: Web Environment", - "Intended Audience :: Developers", - "Operating System :: OS Independent", - "License :: OSI Approved :: Apache Software License", - "Programming Language :: Python", - "Topic :: Internet :: WWW/HTTP", - "Topic :: Software Development :: Libraries :: Python Modules", - "Programming Language :: Python", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: Implementation :: CPython", - "Programming Language :: Python :: Implementation :: PyPy" + "Development Status :: 4 - Beta", + "Environment :: Web Environment", + "Intended Audience :: Developers", + "Operating System :: OS Independent", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python", + "Topic :: Internet :: WWW/HTTP", + "Topic :: Software Development :: Libraries :: Python Modules", + "Programming Language :: Python", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy" ] diff --git a/setup.cfg b/setup.cfg index 9b07aee0..04eee57d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -2,7 +2,6 @@ packages = find: package_dir = =src -setup_requires = pyyaml [options.packages.find] where = src diff --git a/setup.py b/setup.py index 0e14118c..11a2e5ed 100644 --- a/setup.py +++ b/setup.py @@ -1,21 +1,31 @@ #!/usr/bin/env python # flake8: noqa -from contextlib import suppress -from os import fspath +from os import path +from shutil import copyfile from pathlib import Path +from subprocess import check_call from typing import Optional, List, Dict - -from setuptools import setup, Command, find_namespace_packages +from setuptools import setup, Command from setuptools.command.build import build, SubCommand -from setuptools.command.editable_wheel import editable_wheel +from pybind11.setup_helpers import Pybind11Extension, build_ext +from setuptools.command.install import install as _install + + +build.sub_commands.insert(0, ("compile_uap_cpp", None)) -import yaml +class CustomInstallCommand(_install): + def run(self): + _install.run(self) -build.sub_commands.insert(0, ("compile-regexes", None)) + # после установки копируем файл regexes.yaml + source_file = Path('uap-core/regexes.yaml') + target_file = Path(self.install_lib, 'ua_parser/regexes.yaml') + if source_file.is_file(): + copyfile(source_file, target_file) -class CompileRegexes(Command, SubCommand): +class CompileUapCpp(Command, SubCommand): def initialize_options(self) -> None: self.pkg_name: Optional[str] = None @@ -23,111 +33,37 @@ def finalize_options(self) -> None: self.pkg_name = self.distribution.get_name().replace("-", "_") def get_source_files(self) -> List[str]: - return ["uap-core/regexes.yaml"] + return ["uap-cpp/libuaparser_cpp.a"] def get_outputs(self) -> List[str]: - return [f"{self.pkg_name}/_regexes.py"] + return [f"{self.pkg_name}/libuaparser_cpp.a"] def get_output_mapping(self) -> Dict[str, str]: return dict(zip(self.get_source_files(), self.get_outputs())) def run(self) -> None: - # FIXME: check git / submodules? - """ - work_path = self.work_path - if not os.path.exists(os.path.join(work_path, ".git")): - return - - log.info("initializing git submodules") - check_output(["git", "submodule", "init"], cwd=work_path) - check_output(["git", "submodule", "update"], cwd=work_path) - """ - if not self.pkg_name: - return # or error? - - yaml_src = Path("uap-core", "regexes.yaml") - if not yaml_src.is_file(): - raise RuntimeError( - f"Unable to find regexes.yaml, should be at {yaml_src!r}" - ) - - def write_params(fields): - # strip trailing None values - while len(fields) > 1 and fields[-1] is None: - fields.pop() - - for field in fields: - fp.write((f" {field!r},\n").encode()) - - with yaml_src.open("rb") as f: - regexes = yaml.safe_load(f) - - if self.editable_mode: - dist_dir = Path("src") - else: - dist_dir = Path(self.get_finalized_command("bdist_wheel").bdist_dir) - - outdir = dist_dir / self.pkg_name - outdir.mkdir(parents=True, exist_ok=True) - - dest = outdir / "_regexes.py" + if not path.exists('uap-cpp/libuaparser_cpp.a'): + check_call(['make', 'uaparser_cpp'], cwd='uap-cpp') - with dest.open("wb") as fp: - # fmt: off - fp.write(b"# -*- coding: utf-8 -*-\n") - fp.write(b"########################################################\n") - fp.write(b"# NOTICE: This file is autogenerated from regexes.yaml #\n") - fp.write(b"########################################################\n") - fp.write(b"\n") - fp.write(b"from .user_agent_parser import (\n") - fp.write(b" UserAgentParser, DeviceParser, OSParser,\n") - fp.write(b")\n") - fp.write(b"\n") - fp.write(b"__all__ = ('USER_AGENT_PARSERS', 'DEVICE_PARSERS', 'OS_PARSERS')\n") - fp.write(b"\n") - fp.write(b"USER_AGENT_PARSERS = [\n") - for device_parser in regexes["user_agent_parsers"]: - fp.write(b" UserAgentParser(\n") - write_params([ - device_parser["regex"], - device_parser.get("family_replacement"), - device_parser.get("v1_replacement"), - device_parser.get("v2_replacement"), - ]) - fp.write(b" ),\n") - fp.write(b"]\n") - fp.write(b"\n") - fp.write(b"DEVICE_PARSERS = [\n") - for device_parser in regexes["device_parsers"]: - fp.write(b" DeviceParser(\n") - write_params([ - device_parser["regex"], - device_parser.get("regex_flag"), - device_parser.get("device_replacement"), - device_parser.get("brand_replacement"), - device_parser.get("model_replacement"), - ]) - fp.write(b" ),\n") - fp.write(b"]\n") - fp.write(b"\n") - fp.write(b"OS_PARSERS = [\n") - for device_parser in regexes["os_parsers"]: - fp.write(b" OSParser(\n") - write_params([ - device_parser["regex"], - device_parser.get("os_replacement"), - device_parser.get("os_v1_replacement"), - device_parser.get("os_v2_replacement"), - device_parser.get("os_v3_replacement"), - device_parser.get("os_v4_replacement"), - ]) - fp.write(b" ),\n") - fp.write(b"]\n") - # fmt: on +ext_modules = [ + Pybind11Extension( + "ua_parser.pyuapcpp", + ["src/main.cpp"], + include_dirs=['uap-cpp'], + extra_compile_args=["-std=c++14"], + extra_objects=['uap-cpp/libuaparser_cpp.a'], + extra_link_args=['uap-cpp/libuaparser_cpp.a', '-lre2', '-lyaml-cpp'], + ), +] setup( cmdclass={ - "compile-regexes": CompileRegexes, - } + "install": CustomInstallCommand, + "compile_uap_cpp": CompileUapCpp, + "build_ext": build_ext + }, + include_package_data=True, + ext_modules=ext_modules, + zip_safe=False ) diff --git a/src/main.cpp b/src/main.cpp new file mode 100644 index 00000000..d00e27ad --- /dev/null +++ b/src/main.cpp @@ -0,0 +1,59 @@ +#include +#include +#include "UaParser.h" + + +namespace py = pybind11; + + +std::shared_ptr p; +bool is_parser_initialized = false; + +void init_parser(const std::string& regex_file_path) { + if (!is_parser_initialized) { + p = std::make_shared(regex_file_path); + is_parser_initialized = true; + } +} + +uap_cpp::UserAgent parse(const std::string& ua) { + if (!is_parser_initialized) { + throw std::runtime_error("UserAgentParser not initialized. Call init_parser first."); + } + return p->parse(ua); +} + +PYBIND11_MODULE(pyuapcpp, m) { + m.doc() = "Python wrapper for the uap-cpp library"; + + m.def("init_parser", &init_parser, "Initialize the parser with the regex file path"); + m.def("parse", &parse, "Parse a user agent string"); + + py::class_(m, "Generic") + .def(py::init<>()) + .def_readwrite("family", &uap_cpp::Generic::family); + + py::class_(m, "Agent") + .def(py::init<>()) + .def("toString", &uap_cpp::Agent::toString) + .def("toVersionString", &uap_cpp::Agent::toVersionString) + .def_readwrite("family", &uap_cpp::Agent::family) + .def_readwrite("major", &uap_cpp::Agent::major) + .def_readwrite("minor", &uap_cpp::Agent::minor) + .def_readwrite("patch", &uap_cpp::Agent::patch) + .def_readwrite("patch_minor", &uap_cpp::Agent::patch_minor); + + py::class_(m, "Device") + .def(py::init<>()) + .def_readwrite("family", &uap_cpp::Device::family) + .def_readwrite("model", &uap_cpp::Device::model) + .def_readwrite("brand", &uap_cpp::Device::brand); + + py::class_(m, "UserAgent") + .def(py::init<>()) + .def("toFullString", &uap_cpp::UserAgent::toFullString) + .def("isSpider", &uap_cpp::UserAgent::isSpider) + .def_readwrite("device", &uap_cpp::UserAgent::device) + .def_readwrite("os", &uap_cpp::UserAgent::os) + .def_readwrite("browser", &uap_cpp::UserAgent::browser); +} \ No newline at end of file diff --git a/src/ua_parser/__init__.py b/src/ua_parser/__init__.py index f1c0a2a2..4592df01 100644 --- a/src/ua_parser/__init__.py +++ b/src/ua_parser/__init__.py @@ -1 +1,7 @@ -VERSION = (0, 16, 1) +import os +from . import pyuapcpp + +regex_file_path = os.path.dirname(os.path.abspath(__file__)) + '/regexes.yaml' + +# Инициализация парсера при импорте модуля +pyuapcpp.init_parser(regex_file_path) \ No newline at end of file diff --git a/src/ua_parser/user_agent_parser.py b/src/ua_parser/user_agent_parser.py index 4cea3620..bf7c86fa 100644 --- a/src/ua_parser/user_agent_parser.py +++ b/src/ua_parser/user_agent_parser.py @@ -1,515 +1,25 @@ -import os -import re -import warnings -from typing import * +from . import pyuapcpp -class UserAgentParser(object): - def __init__( - self, pattern, family_replacement=None, v1_replacement=None, v2_replacement=None - ): - """Initialize UserAgentParser. - - Args: - pattern: a regular expression string - family_replacement: a string to override the matched family (optional) - v1_replacement: a string to override the matched v1 (optional) - v2_replacement: a string to override the matched v2 (optional) - """ - self.user_agent_re = re.compile(pattern) - self.family_replacement = family_replacement - self.v1_replacement = v1_replacement - self.v2_replacement = v2_replacement - - def Parse( - self, user_agent_string: str - ) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[str],]: - family, v1, v2, v3 = None, None, None, None - match = self.user_agent_re.search(user_agent_string) - if match: - if self.family_replacement: - if re.search(r"\$1", self.family_replacement): - family = re.sub(r"\$1", match[1], self.family_replacement) - else: - family = self.family_replacement - else: - family = match[1] - - if self.v1_replacement: - v1 = self.v1_replacement - elif match.lastindex and match.lastindex >= 2: - v1 = match[2] or None - - if self.v2_replacement: - v2 = self.v2_replacement - elif match.lastindex and match.lastindex >= 3: - v2 = match[3] or None - - if match.lastindex and match.lastindex >= 4: - v3 = match[4] or None - - return family, v1, v2, v3 - - -class OSParser(object): - def __init__( - self, - pattern, - os_replacement=None, - os_v1_replacement=None, - os_v2_replacement=None, - os_v3_replacement=None, - os_v4_replacement=None, - ): - """Initialize UserAgentParser. - - Args: - pattern: a regular expression string - os_replacement: a string to override the matched os (optional) - os_v1_replacement: a string to override the matched v1 (optional) - os_v2_replacement: a string to override the matched v2 (optional) - os_v3_replacement: a string to override the matched v3 (optional) - os_v4_replacement: a string to override the matched v4 (optional) - """ - self.user_agent_re = re.compile(pattern) - self.os_replacement = os_replacement - self.os_v1_replacement = os_v1_replacement - self.os_v2_replacement = os_v2_replacement - self.os_v3_replacement = os_v3_replacement - self.os_v4_replacement = os_v4_replacement - - def Parse( - self, user_agent_string: str - ) -> Tuple[ - Optional[str], - Optional[str], - Optional[str], - Optional[str], - Optional[str], - ]: - os, os_v1, os_v2, os_v3, os_v4 = None, None, None, None, None - match = self.user_agent_re.search(user_agent_string) - if match: - if self.os_replacement: - os = MultiReplace(self.os_replacement, match) - elif match.lastindex: - os = match[1] - - if self.os_v1_replacement: - os_v1 = MultiReplace(self.os_v1_replacement, match) - elif match.lastindex and match.lastindex >= 2: - os_v1 = match[2] - - if self.os_v2_replacement: - os_v2 = MultiReplace(self.os_v2_replacement, match) - elif match.lastindex and match.lastindex >= 3: - os_v2 = match[3] - - if self.os_v3_replacement: - os_v3 = MultiReplace(self.os_v3_replacement, match) - elif match.lastindex and match.lastindex >= 4: - os_v3 = match[4] - - if self.os_v4_replacement: - os_v4 = MultiReplace(self.os_v4_replacement, match) - elif match.lastindex and match.lastindex >= 5: - os_v4 = match[5] - - return os, os_v1, os_v2, os_v3, os_v4 - - -def MultiReplace(string, match): - def _repl(m): - index = int(m[1]) - 1 - group = match.groups() - if index < len(group): - return group[index] - return "" - - _string = re.sub(r"\$(\d)", _repl, string).strip() - return _string or None - - -class DeviceParser(object): - def __init__( - self, - pattern, - regex_flag=None, - device_replacement=None, - brand_replacement=None, - model_replacement=None, - ): - """Initialize UserAgentParser. - - Args: - pattern: a regular expression string - device_replacement: a string to override the matched device (optional) - """ - self.user_agent_re = re.compile( - pattern, re.IGNORECASE if regex_flag == "i" else 0 - ) - self.device_replacement = device_replacement - self.brand_replacement = brand_replacement - self.model_replacement = model_replacement - - def Parse( - self, user_agent_string: str - ) -> Tuple[Optional[str], Optional[str], Optional[str],]: - device, brand, model = None, None, None - match = self.user_agent_re.search(user_agent_string) - if match: - if self.device_replacement: - device = MultiReplace(self.device_replacement, match) - else: - device = match[1] - - if self.brand_replacement: - brand = MultiReplace(self.brand_replacement, match) - - if self.model_replacement: - model = MultiReplace(self.model_replacement, match) - elif len(match.groups()) > 0: - model = match[1] - - return device, brand, model - - -MAX_CACHE_SIZE = 200 -_PARSE_CACHE: Dict[str, Dict[str, Any]] = {} - - -def _lookup(ua: str): - if not isinstance(ua, str): - raise TypeError(f"Expected user agent to be a string, got {ua!r}") - - entry = _PARSE_CACHE.get(ua) - if entry is not None: - return entry - - if len(_PARSE_CACHE) >= MAX_CACHE_SIZE: - _PARSE_CACHE.clear() - - v = _PARSE_CACHE[ua] = {"string": ua} - return v - - -def _cached(ua, key, fn): - entry = _lookup(ua) - r = entry.get(key) - if not r: - r = entry[key] = fn(ua) - return r - - -def Parse(user_agent_string: str, **_jsParseBits): - """Parse all the things - Args: - user_agent_string: the full user agent string - Returns: - A dictionary containing all parsed bits - """ - if _jsParseBits: - warnings.warn( - "javascript overrides are not used anymore", - category=DeprecationWarning, - stacklevel=2, - ) - - entry = _lookup(user_agent_string) - # entry is complete, return directly - if len(entry) == 4: - return entry - - # entry is partially or entirely empty - if "user_agent" not in entry: - entry["user_agent"] = _ParseUserAgent(user_agent_string) - if "os" not in entry: - entry["os"] = _ParseOS(user_agent_string) - if "device" not in entry: - entry["device"] = _ParseDevice(user_agent_string) - - return entry - - -def ParseUserAgent(user_agent_string, **_jsParseBits): - """Parses the user-agent string for user agent (browser) info. - Args: - user_agent_string: The full user-agent string. - Returns: - A dictionary containing parsed bits. - """ - if _jsParseBits: - warnings.warn( - "javascript overrides are not used anymore", - category=DeprecationWarning, - stacklevel=2, - ) - return _cached(user_agent_string, "user_agent", _ParseUserAgent) - - -def _ParseUserAgent(user_agent_string): - for uaParser in USER_AGENT_PARSERS: - family, v1, v2, v3 = uaParser.Parse(user_agent_string) - if family: - break - - family = family or "Other" +def Parse(user_agent_string: str, **kwargs): + ua = pyuapcpp.parse(user_agent_string) return { - "family": family, - "major": v1 or None, - "minor": v2 or None, - "patch": v3 or None, - } - - -def ParseOS(user_agent_string, **_jsParseBits): - """Parses the user-agent string for operating system info - Args: - user_agent_string: The full user-agent string. - Returns: - A dictionary containing parsed bits. - """ - if _jsParseBits: - warnings.warn( - "javascript overrides are not used anymore", - category=DeprecationWarning, - stacklevel=2, - ) - return _cached(user_agent_string, "os", _ParseOS) - - -def _ParseOS(user_agent_string): - for osParser in OS_PARSERS: - os, os_v1, os_v2, os_v3, os_v4 = osParser.Parse(user_agent_string) - if os: - break - os = os or "Other" - return { - "family": os, - "major": os_v1, - "minor": os_v2, - "patch": os_v3, - "patch_minor": os_v4, - } - - -def ParseDevice(user_agent_string, **_jsParseBits): - """Parses the user-agent string for device info. - Args: - user_agent_string: The full user-agent string. - Returns: - A dictionary containing parsed bits. - """ - if _jsParseBits: - warnings.warn( - "javascript overrides are not used anymore", - category=DeprecationWarning, - stacklevel=2, - ) - return _cached(user_agent_string, "device", _ParseDevice) - - -def _ParseDevice(user_agent_string): - for deviceParser in DEVICE_PARSERS: - device, brand, model = deviceParser.Parse(user_agent_string) - if device: - break - - if device is None: - device = "Other" - - return {"family": device, "brand": brand, "model": model} - - -def PrettyUserAgent(family, v1=None, v2=None, v3=None): - """Pretty user agent string.""" - if v3: - if v3[0].isdigit(): - return f"{family} {v1}.{v2}.{v3}" - else: - return f"{family} {v1}.{v2}{v3}" - elif v2: - return f"{family} {v1}.{v2}" - elif v1: - return f"{family} {v1}" - return family - - -def PrettyOS(os, os_v1=None, os_v2=None, os_v3=None, os_v4=None): - """Pretty os string.""" - if os_v4: - return f"{os} {os_v1}.{os_v2}.{os_v3}.{os_v4}" - if os_v3: - if os_v3[0].isdigit(): - return f"{os} {os_v1}.{os_v2}.{os_v3}" - else: - return f"{os} {os_v1}.{os_v2}{os_v3}" - elif os_v2: - return f"{os} {os_v1}.{os_v2}" - elif os_v1: - return f"{os} {os_v1}" - return os - - -def ParseWithJSOverrides( - user_agent_string, - js_user_agent_string=None, - js_user_agent_family=None, - js_user_agent_v1=None, - js_user_agent_v2=None, - js_user_agent_v3=None, -): - warnings.warn( - "Use Parse (or a specialised parser)", DeprecationWarning, stacklevel=2 - ) - - # Override via JS properties. - if js_user_agent_family is not None and js_user_agent_family != "": - family = js_user_agent_family - v1 = None - v2 = None - v3 = None - if js_user_agent_v1 is not None: - v1 = js_user_agent_v1 - if js_user_agent_v2 is not None: - v2 = js_user_agent_v2 - if js_user_agent_v3 is not None: - v3 = js_user_agent_v3 - else: - for parser in USER_AGENT_PARSERS: - family, v1, v2, v3 = parser.Parse(user_agent_string) - if family: - break - - # Override for Chrome Frame IFF Chrome is enabled. - if ( - js_user_agent_string - and js_user_agent_string.find("Chrome/") > -1 - and user_agent_string.find("chromeframe") > -1 - ): - family = "Chrome Frame (%s %s)" % (family, v1) - ua_dict = ParseUserAgent(js_user_agent_string) - v1 = ua_dict["major"] - v2 = ua_dict["minor"] - v3 = ua_dict["patch"] - - return family or "Other", v1, v2, v3 - - -def Pretty(family, v1=None, v2=None, v3=None): - warnings.warn("Use PrettyUserAgent", DeprecationWarning, stacklevel=2) - if v3: - if v3[0].isdigit(): - return f"{family} {v1}.{v2}.{v3}" - else: - return f"{family} {v1}.{v2}{v3}" - elif v2: - return f"{family} {v1}.{v2}" - elif v1: - return f"{family} {v1}" - return family - - -def GetFilters( - user_agent_string, - js_user_agent_string=None, - js_user_agent_family=None, - js_user_agent_v1=None, - js_user_agent_v2=None, - js_user_agent_v3=None, -): - warnings.warn("No use case anymore", DeprecationWarning, stacklevel=2) - - filters = {} - filterdict = { - "js_user_agent_string": js_user_agent_string, - "js_user_agent_family": js_user_agent_family, - "js_user_agent_v1": js_user_agent_v1, - "js_user_agent_v2": js_user_agent_v2, - "js_user_agent_v3": js_user_agent_v3, - } - for key, value in filterdict.items(): - if value is not None and value != "": - filters[key] = value - return filters - - -# Build the list of user agent parsers from YAML -UA_PARSER_YAML = os.environ.get("UA_PARSER_YAML") -if UA_PARSER_YAML: - # This will raise an ImportError if missing, obviously since it's no - # longer a requirement - import yaml - - try: - # Try and use libyaml bindings if available since faster, - # pyyaml doesn't do it by default (yaml/pyyaml#436) - from yaml import CSafeLoader as SafeLoader - except ImportError: - from yaml import SafeLoader # type: ignore - - with open(UA_PARSER_YAML, "rb") as fp: - regexes = yaml.load(fp, Loader=SafeLoader) - - USER_AGENT_PARSERS = [] - for _ua_parser in regexes["user_agent_parsers"]: - _regex = _ua_parser["regex"] - - _family_replacement = _ua_parser.get("family_replacement") - _v1_replacement = _ua_parser.get("v1_replacement") - _v2_replacement = _ua_parser.get("v2_replacement") - - USER_AGENT_PARSERS.append( - UserAgentParser( - _regex, _family_replacement, _v1_replacement, _v2_replacement - ) - ) - - OS_PARSERS = [] - for _os_parser in regexes["os_parsers"]: - _regex = _os_parser["regex"] - - _os_replacement = _os_parser.get("os_replacement") - _os_v1_replacement = _os_parser.get("os_v1_replacement") - _os_v2_replacement = _os_parser.get("os_v2_replacement") - _os_v3_replacement = _os_parser.get("os_v3_replacement") - _os_v4_replacement = _os_parser.get("os_v4_replacement") - - OS_PARSERS.append( - OSParser( - _regex, - _os_replacement, - _os_v1_replacement, - _os_v2_replacement, - _os_v3_replacement, - _os_v4_replacement, - ) - ) - - DEVICE_PARSERS = [] - for _device_parser in regexes["device_parsers"]: - _regex = _device_parser["regex"] - - _regex_flag = _device_parser.get("regex_flag") - _device_replacement = _device_parser.get("device_replacement") - _brand_replacement = _device_parser.get("brand_replacement") - _model_replacement = _device_parser.get("model_replacement") - - DEVICE_PARSERS.append( - DeviceParser( - _regex, - _regex_flag, - _device_replacement, - _brand_replacement, - _model_replacement, - ) - ) - - # Clean our our temporary vars explicitly - # so they can't be reused or imported - del regexes - del yaml - del SafeLoader -else: - # Just load our pre-compiled versions - from ._regexes import USER_AGENT_PARSERS, DEVICE_PARSERS, OS_PARSERS + 'user_agent': { + "family": ua.browser.family, + "major": ua.browser.major or None, + "minor": ua.browser.minor or None, + "patch": ua.browser.patch or None, + }, + 'os': { + "family": ua.os.family, + "major": ua.os.major or None, + "minor": ua.os.minor or None, + "patch": ua.os.patch or None, + "patch_minor": ua.os.patch_minor or None, + }, + 'device': { + "family": ua.device.family or None, + "brand": ua.device.brand or None, + "model": ua.device.model or None + } + } \ No newline at end of file diff --git a/tests/test_legacy.py b/tests/test_legacy.py deleted file mode 100644 index 03feeda3..00000000 --- a/tests/test_legacy.py +++ /dev/null @@ -1,247 +0,0 @@ -import logging -import os -import platform -import sys -import warnings - -import pytest -import yaml - -if platform.python_implementation() == "PyPy": - from yaml import SafeLoader -else: - try: - from yaml import CSafeLoader as SafeLoader - except ImportError: - logging.getLogger(__name__).warning( - "PyYaml C extension not available to run tests, this will result " - "in dramatic tests slowdown." - ) - from yaml import SafeLoader - - -from ua_parser import user_agent_parser - -TEST_RESOURCES_DIR = os.path.join( - os.path.abspath(os.path.dirname(__file__)), "../uap-core" -) - - -class TestParse: - def testBrowserscopeStrings(self): - self.runUserAgentTestsFromYAML( - os.path.join(TEST_RESOURCES_DIR, "tests/test_ua.yaml") - ) - - def testBrowserscopeStringsOS(self): - self.runOSTestsFromYAML(os.path.join(TEST_RESOURCES_DIR, "tests/test_os.yaml")) - - def testStringsOS(self): - self.runOSTestsFromYAML( - os.path.join(TEST_RESOURCES_DIR, "test_resources/additional_os_tests.yaml") - ) - - def testStringsDevice(self): - self.runDeviceTestsFromYAML( - os.path.join(TEST_RESOURCES_DIR, "tests/test_device.yaml") - ) - - def testMozillaStrings(self): - self.runUserAgentTestsFromYAML( - os.path.join( - TEST_RESOURCES_DIR, "test_resources/firefox_user_agent_strings.yaml" - ) - ) - - # NOTE: The YAML file used here is one output by makePGTSComparisonYAML() - # below, as opposed to the pgts_browser_list-orig.yaml file. The -orig - # file is by no means perfect, but identifies many browsers that we - # classify as "Other". This test itself is mostly useful to know when - # somthing in UA parsing changes. An effort should be made to try and - # reconcile the differences between the two YAML files. - def testPGTSStrings(self): - self.runUserAgentTestsFromYAML( - os.path.join(TEST_RESOURCES_DIR, "test_resources/pgts_browser_list.yaml") - ) - - def testParseAll(self): - user_agent_string = "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.4; fr; rv:1.9.1.5) Gecko/20091102 Firefox/3.5.5,gzip(gfe),gzip(gfe)" - expected = { - "device": {"family": "Mac", "brand": "Apple", "model": "Mac"}, - "os": { - "family": "Mac OS X", - "major": "10", - "minor": "4", - "patch": None, - "patch_minor": None, - }, - "user_agent": { - "family": "Firefox", - "major": "3", - "minor": "5", - "patch": "5", - }, - "string": user_agent_string, - } - - result = user_agent_parser.Parse(user_agent_string) - assert result == expected, "UA: {0}\n expected<{1}> != actual<{2}>".format( - user_agent_string, expected, result - ) - - # Run a set of test cases from a YAML file - def runUserAgentTestsFromYAML(self, file_name): - yamlFile = open(os.path.join(TEST_RESOURCES_DIR, file_name)) - yamlContents = yaml.load(yamlFile, Loader=SafeLoader) - yamlFile.close() - - for test_case in yamlContents["test_cases"]: - # Inputs to Parse() - user_agent_string = test_case["user_agent_string"] - - # The expected results - expected = { - "family": test_case["family"], - "major": test_case["major"], - "minor": test_case["minor"], - "patch": test_case["patch"], - } - - result = {} - result = user_agent_parser.ParseUserAgent(user_agent_string) - assert ( - result == expected - ), "UA: {0}\n expected<{1}, {2}, {3}, {4}> != actual<{5}, {6}, {7}, {8}>".format( - user_agent_string, - expected["family"], - expected["major"], - expected["minor"], - expected["patch"], - result["family"], - result["major"], - result["minor"], - result["patch"], - ) - assert ( - len(user_agent_parser._PARSE_CACHE) <= user_agent_parser.MAX_CACHE_SIZE - ), "verify that the cache size never exceeds the configured setting" - - def runOSTestsFromYAML(self, file_name): - yamlFile = open(os.path.join(TEST_RESOURCES_DIR, file_name)) - yamlContents = yaml.load(yamlFile, Loader=SafeLoader) - yamlFile.close() - - for test_case in yamlContents["test_cases"]: - # Inputs to Parse() - user_agent_string = test_case["user_agent_string"] - - # The expected results - expected = { - "family": test_case["family"], - "major": test_case["major"], - "minor": test_case["minor"], - "patch": test_case["patch"], - "patch_minor": test_case["patch_minor"], - } - - result = user_agent_parser.ParseOS(user_agent_string) - assert ( - result == expected - ), "UA: {0}\n expected<{1} {2} {3} {4} {5}> != actual<{6} {7} {8} {9} {10}>".format( - user_agent_string, - expected["family"], - expected["major"], - expected["minor"], - expected["patch"], - expected["patch_minor"], - result["family"], - result["major"], - result["minor"], - result["patch"], - result["patch_minor"], - ) - - def runDeviceTestsFromYAML(self, file_name): - yamlFile = open(os.path.join(TEST_RESOURCES_DIR, file_name)) - yamlContents = yaml.load(yamlFile, Loader=SafeLoader) - yamlFile.close() - - for test_case in yamlContents["test_cases"]: - # Inputs to Parse() - user_agent_string = test_case["user_agent_string"] - - # The expected results - expected = { - "family": test_case["family"], - "brand": test_case["brand"], - "model": test_case["model"], - } - - result = user_agent_parser.ParseDevice(user_agent_string) - assert ( - result == expected - ), "UA: {0}\n expected<{1} {2} {3}> != actual<{4} {5} {6}>".format( - user_agent_string, - expected["family"], - expected["brand"], - expected["model"], - result["family"], - result["brand"], - result["model"], - ) - - -class TestGetFilters: - def testGetFiltersNoMatchesGiveEmptyDict(self): - user_agent_string = "foo" - with pytest.warns(DeprecationWarning): - filters = user_agent_parser.GetFilters( - user_agent_string, js_user_agent_string=None - ) - assert {} == filters - - def testGetFiltersJsUaPassedThrough(self): - user_agent_string = "foo" - with pytest.warns(DeprecationWarning): - filters = user_agent_parser.GetFilters( - user_agent_string, js_user_agent_string="bar" - ) - assert {"js_user_agent_string": "bar"} == filters - - def testGetFiltersJsUserAgentFamilyAndVersions(self): - user_agent_string = ( - "Mozilla/4.0 (compatible; MSIE 8.0; " - "Windows NT 5.1; Trident/4.0; GTB6; .NET CLR 2.0.50727; " - ".NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)" - ) - with pytest.warns(DeprecationWarning): - filters = user_agent_parser.GetFilters( - user_agent_string, - js_user_agent_string="bar", - js_user_agent_family="foo", - ) - assert {"js_user_agent_string": "bar", "js_user_agent_family": "foo"} == filters - - -class TestDeprecationWarnings: - def test_parser_deprecation(self): - with pytest.warns(DeprecationWarning) as ws: - user_agent_parser.ParseWithJSOverrides("") - assert len(ws) == 1 - - def test_printer_deprecation(self): - with pytest.warns(DeprecationWarning) as ws: - user_agent_parser.Pretty("") - assert len(ws) == 1 - - def test_js_bits_deprecation(self): - for parser, count in [ - (user_agent_parser.Parse, 1), - (user_agent_parser.ParseUserAgent, 1), - (user_agent_parser.ParseOS, 1), - (user_agent_parser.ParseDevice, 1), - ]: - user_agent_parser._PARSE_CACHE.clear() - with pytest.warns(DeprecationWarning) as ws: - parser("some random thing", js_attribute=True) - assert len(ws) == count diff --git a/tests/tests.py b/tests/tests.py new file mode 100644 index 00000000..e69de29b diff --git a/tox.ini b/tox.ini index e9162596..a54a726f 100644 --- a/tox.ini +++ b/tox.ini @@ -17,8 +17,7 @@ wheel_build_env = .pkg # for extra deps # extras = deps = - pytest - pyyaml + pytest commands = pytest -Werror --doctest-glob="*.rst" {posargs} diff --git a/uap-core b/uap-core index 7b002982..d4cde4c5 160000 --- a/uap-core +++ b/uap-core @@ -1 +1 @@ -Subproject commit 7b002982f688dd11a23478cd1e101d2c72b2c0e7 +Subproject commit d4cde4c565a7e588472fbf6667f01fc4c23fa60b diff --git a/uap-cpp b/uap-cpp new file mode 160000 index 00000000..05796295 --- /dev/null +++ b/uap-cpp @@ -0,0 +1 @@ +Subproject commit 05796295949028dbd0db4e476c143fd372c68fbd