diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b41c764..6c44397 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,24 +1,33 @@ name: CI on: - workflow_dispatch: pull_request: + push: + workflow_dispatch: + +permissions: + checks: write + contents: write + # deployments permission to deploy GitHub pages website + deployments: write + pull-requests: write + jobs: python-unit: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"] defaults: run: working-directory: . steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} cache: 'pip' @@ -37,9 +46,81 @@ jobs: - name: Run unit tests run: | - pytest + pytest --junitxml=junit_pytest_main.xml --cov-report=term-missing:skip-covered + mv .coverage .coverage_main - name: Run Django integration tests working-directory: ./edtf_django_tests run: | - python manage.py test edtf_integration + pytest edtf_integration/tests.py --ds=edtf_django_tests.settings --junitxml=../junit_pytest_django.xml --cov-report=term-missing:skip-covered + mv .coverage ../.coverage_django + + - name: Combine coverage reports + run: | + coverage combine .coverage_main .coverage_django + coverage report --omit="edtf_django_tests/*" + coverage xml -o coverage_combined.xml --omit="edtf_django_tests/*" + + - name: Combine JUnit XML reports + run: | + python combine_junit.py combined_junit_pytest.xml junit_pytest_main.xml junit_pytest_django.xml + + - name: Pytest coverage comment + id: coverageComment + uses: MishaKav/pytest-coverage-comment@main + with: + pytest-xml-coverage-path: ./coverage_combined.xml + junitxml-path: ./combined_junit_pytest.xml + unique-id-for-comment: ${{ matrix.python-version }} + github-token: ${{ 
secrets.GITHUB_TOKEN }} + + - name: Check the output coverage + run: | + echo "Coverage Percentage - ${{ steps.coverageComment.outputs.coverage }}" + echo "Coverage Color - ${{ steps.coverageComment.outputs.color }}" + echo "Coverage Html - ${{ steps.coverageComment.outputs.coverageHtml }}" + echo "Summary Report -" ${{ steps.coverageComment.outputs.summaryReport }} + echo "Coverage Warnings - ${{ steps.coverageComment.outputs.warnings }}" + echo "Coverage Errors - ${{ steps.coverageComment.outputs.errors }}" + echo "Coverage Failures - ${{ steps.coverageComment.outputs.failures }}" + echo "Coverage Skipped - ${{ steps.coverageComment.outputs.skipped }}" + echo "Coverage Tests - ${{ steps.coverageComment.outputs.tests }}" + echo "Coverage Time - ${{ steps.coverageComment.outputs.time }}" + echo "Not Success Test Info - ${{ steps.coverageComment.outputs.notSuccessTestInfo }}" + + - name: Run benchmarks + run: | + pytest -m benchmark --benchmark-json=./output.json + + - name: Download previous benchmark data + uses: actions/cache@v5 + with: + path: ./cache + key: ${{ runner.os }}-benchmark + + - name: Publish benchmark results + uses: benchmark-action/github-action-benchmark@v1 + if: github.event_name == 'pull_request' && github.repository == 'ixc/python-edtf' + with: + tool: 'pytest' + auto-push: true + comment-always: true + output-file-path: output.json + github-token: ${{ secrets.GITHUB_TOKEN }} + comment-on-alert: true + save-data-file: true + summary-always: true + + - name: Comment on benchmark results without publishing + if: github.event_name != 'pull_request' || github.repository != 'ixc/python-edtf' + uses: benchmark-action/github-action-benchmark@v1 + with: + tool: 'pytest' + auto-push: false + github-token: ${{ secrets.GITHUB_TOKEN }} + comment-always: true + output-file-path: output.json + comment-on-alert: false + save-data-file: true + summary-always: true + external-data-json-path: ./cache/benchmark-data.json diff --git 
a/.github/workflows/coverage_readme.yml b/.github/workflows/coverage_readme.yml new file mode 100644 index 0000000..860ace3 --- /dev/null +++ b/.github/workflows/coverage_readme.yml @@ -0,0 +1,67 @@ +name: Update Coverage on Readme +on: + push: + branches: + - main + +# https://docs.github.com/en/actions/using-jobs/assigning-permissions-to-jobs +# `contents` is for permission to the contents of the repository. +# `pull-requests` is for permission to pull request +permissions: + contents: write + checks: write + pull-requests: write + +# see: https://github.com/MishaKav/pytest-coverage-comment +jobs: + update-coverage-on-readme: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + with: + persist-credentials: false + fetch-depth: 0 + + - name: Set up Python 3.13 + uses: actions/setup-python@v6 + with: + python-version: 3.13 + cache: 'pip' + cache-dependency-path: '**/pyproject.toml' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install .[test] + + - name: Run tests and generate coverage + run: | + pytest + mv .coverage .coverage_main + cd edtf_django_tests + coverage run manage.py test edtf_integration + mv .coverage ../.coverage_django + cd .. 
+ coverage combine .coverage_main .coverage_django + coverage report --omit="edtf_django_tests/*" + coverage xml -o coverage_combined.xml --omit="edtf_django_tests/*" + + - name: Pytest coverage comment + if: ${{ github.ref == 'refs/heads/main' }} + id: coverageComment + uses: MishaKav/pytest-coverage-comment@main + with: + pytest-xml-coverage-path: ./coverage_combined.xml + hide-comment: true + + - name: Update Readme with Coverage Html + if: ${{ github.ref == 'refs/heads/main' }} + run: | + sed -i '//,//c\\n\${{ steps.coverageComment.outputs.coverageHtml }}\n' ./README.md + + - name: Commit & Push changes to Readme + if: ${{ github.ref == 'refs/heads/main' }} + uses: actions-js/push@master + with: + message: Update coverage on Readme + github_token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.gitignore b/.gitignore index 7c23190..d27f79d 100644 --- a/.gitignore +++ b/.gitignore @@ -42,7 +42,14 @@ htmlcov/ .cache nosetests.xml coverage.xml +coverage_combined.xml +.coverage_main +.coverage_django *,cover +combined_junit_pytest.xml +pytest.xml +junit_pytest_main.xml +junit_pytest_django.xml # Translations *.mo @@ -57,3 +64,5 @@ docs/_build/ # PyBuilder target/ +.idea +.DS_Store diff --git a/README.md b/README.md index 98e33b7..206f4f4 100644 --- a/README.md +++ b/README.md @@ -1,80 +1,98 @@ -edtf -===== +# python-edtf -An implementation of EDTF format in Python, together with utility functions for parsing natural language date texts, and converting EDTF dates to related Python `date` or `struct_time` objects. + +Coverage
Coverage Report
FileStmtsMissCoverMissing
edtf
   __init__.py40100% 
   appsettings.py29293%12–13
   convert.py631182%11–19, 21, 72
   fields.py1191190%1, 3–8, 10–13, 15, 23, 29, 31, 33–35, 38–39, 51–58, 60, 63, 65–70, 72–76, 78–79, 81, 83–84, 86, 88–89, 91, 93–95, 97–98, 100, 102–105, 107, 109–112, 114, 123–125, 128, 131–132, 135–136, 139–140, 142–144, 147, 151, 153, 155, 157, 160–173, 179, 181–182, 184–185, 190–191, 193–194, 196, 204, 206, 208–209, 212–213, 223–226, 234
   jdutil.py984455%37, 55, 91–92, 287, 291, 314, 316–317, 319, 321, 346, 348, 350, 370–372, 374, 376, 378, 381–383, 385, 387, 389, 392–393, 395, 397, 399–400, 402, 405–407, 410–413, 415, 417, 424, 431
   tests.py119496%137–138, 148–149
   util.py330100% 
edtf/natlang
   __init__.py20100% 
   en.py1581193%85, 88, 145, 181–182, 192–193, 218–219, 223, 290
   tests.py10190%211
edtf/parser
   __init__.py40100% 
   edtf_exceptions.py15286%24, 29
   grammar.py133496%365, 368, 370, 376
   parser_classes.py66110683%114–116, 123, 126, 188, 194–198, 205–207, 214–218, 227–229, 234–240, 273, 345, 358–359, 390–394, 397, 412, 415–419, 422–426, 444–446, 474, 483, 546, 560, 564, 600, 608, 612, 659–660, 666, 684–685, 688, 694, 700, 702, 706, 713, 754, 779, 785, 789, 804, 808, 898, 908, 919–920, 922, 929, 932, 943, 948, 953, 989, 992, 998, 1003, 1005–1013, 1028, 1033, 1111, 1116, 1149
   tests.py89198%412
TOTAL153730580% 
+ -See http://www.loc.gov/standards/datetime/ for the current draft specification. +An implementation of EDTF format in Python, together with utility functions for parsing natural language date texts, and converting EDTF dates to related Python `date` or `struct_time` objects. -This project is based on python-edtf and was developed to include the newest specification +See <http://www.loc.gov/standards/datetime/> for the final draft specification. ## To install - pip install edtf +```shell +pip install edtf +``` ## To use - >>> from edtf import parse_edtf - # Parse an EDTF string to an EDTFObject - >>> e = parse_edtf("1979-08~") # approx August 1979 - >>> e - UncertainOrApproximate: '1979-08~' - # normalised string representation (some different EDTF strings have identical meanings) - >>> unicode(e) - u'1979-08~' - - # Derive Python date objects - # lower and upper bounds that strictly adhere to the given range - >>> e.lower_strict()[:3], e.upper_strict()[:3] - ((1979, 8, 1), (1979, 8, 31)) - # lower and upper bounds that are padded if there's indicated uncertainty - >>> e.lower_fuzzy()[:3], e.upper_fuzzy()[:3] - ((1979, 7, 1), (1979, 9, 30)) - - # Date intervals - >>> interval = parse_edtf("1979-08~/..") - >>> interval - Level1Interval: '1979-08~/..' - # Intervals have lower and upper EDTF objects. - >>> interval.lower, interval.upper - (UncertainOrApproximate: '1979-08~', UnspecifiedIntervalSection: '..') - >>> interval.lower.lower_strict()[:3], interval.lower.upper_strict()[:3] - ((1979, 8, 1), (1979, 8, 31)) - >>> interval.upper.upper_strict() # '..' 
is interpreted to mean open interval and is returning -/+ math.inf - math.inf - - # Date collections - >>> coll = parse_edtf('{1667,1668, 1670..1672}') - >>> coll - MultipleDates: '{1667, 1668, 1670..1672}' - >>> coll.objects - (Date: '1667', Date: '1668', Consecutives: '1670..1672') +```python +>>> from edtf import parse_edtf + +# Parse an EDTF string to an EDTFObject +>>> +>>> e = parse_edtf("1979-08~") # approx August 1979 +>>> e +UncertainOrApproximate: '1979-08~' + +# normalised string representation (some different EDTF strings have identical meanings) +>>> +>>> str(e) +'1979-08~' + +# Derive Python date objects + +# lower and upper bounds that strictly adhere to the given range +>>> +>>> e.lower_strict()[:3], e.upper_strict()[:3] +((1979, 8, 1), (1979, 8, 31)) + +# lower and upper bounds that are padded if there's indicated uncertainty +>>> +>>> e.lower_fuzzy()[:3], e.upper_fuzzy()[:3] +((1979, 7, 1), (1979, 9, 30)) + +# Date intervals +>>> +>>> interval = parse_edtf("1979-08~/..") +>>> interval +Level1Interval: '1979-08~/..' + +# Intervals have lower and upper EDTF objects +>>> +>>> interval.lower, interval.upper +(UncertainOrApproximate: '1979-08~', UnspecifiedIntervalSection: '..') +>>> interval.lower.lower_strict()[:3], interval.lower.upper_strict()[:3] +((1979, 8, 1), (1979, 8, 31)) +>>> interval.upper.upper_strict() # '..' is interpreted to mean open interval and is returning -/+ math.inf +math.inf + +# Date collections +>>> +>>> coll = parse_edtf('{1667,1668, 1670..1672}') +>>> coll +MultipleDates: '{1667, 1668, 1670..1672}' +>>> coll.objects +(Date: '1667', Date: '1668', Consecutives: '1670..1672') +``` The object returned by `parse_edtf()` is an instance of an `edtf.parser.parser_classes.EDTFObject` subclass, depending on the type of date that was parsed. 
These classes are: - # Level 0 - Date - DateAndTime - Interval - - # Level 1 - UncertainOrApproximate - Unspecified - Level1Interval - UnspecifiedIntervalSection - LongYear - Season - - # Level 2 - PartialUncertainOrApproximate - PartialUnspecified - OneOfASet - MultipleDates - MaskedPrecision - Level2Interval - Level2Season - ExponentialYear - -All of these implement `upper/lower_strict/fuzzy()` methods to derive `struct_time` objects, except of UnspecifiedIntervalSection, that can also return math.inf value +```text +# Level 0 +Date +DateAndTime +Interval + +# Level 1 +UncertainOrApproximate +Unspecified +Level1Interval +UnspecifiedIntervalSection +LongYear +Season + +# Level 2 +PartialUncertainOrApproximate +PartialUnspecified +OneOfASet +MultipleDates +Level2Interval +Level2Season +ExponentialYear +``` + +All of these implement `lower_strict()`, `upper_strict()`, `lower_fuzzy()` and `upper_fuzzy()` methods to derive `struct_time` objects, except for `UnspecifiedIntervalSection`, which can also return a `math.inf` value. The `*Interval` instances have `upper` and `lower` properties that are themselves `EDTFObject` instances. @@ -90,163 +108,209 @@ Test coverage includes every example given in the spec table of features. 
* Date: - >>> parse_edtf('1979-08') # August 1979 - Date: '1979-08' +```python +>>> parse_edtf('1979-08') # August 1979 +Date: '1979-08' +``` * Date and Time: - >>> parse_edtf('2004-01-01T10:10:10+05:00') - DateAndTime: '2004-01-01T10:10:10+05:00' +```python +>>> parse_edtf('2004-01-01T10:10:10+05:00') +DateAndTime: '2004-01-01T10:10:10+05:00' +``` * Interval (start/end): - >>> parse_edtf('1979-08-28/1979-09-25') # From August 28 to September 25 1979 - Interval: '1979-08-28/1979-09-25' +```python +>>> parse_edtf('1979-08-28/1979-09-25') # From August 28 to September 25 1979 +Interval: '1979-08-28/1979-09-25' +``` ### Level 1 Extensions * Uncertain/Approximate dates: - >>> parse_edtf('1979-08-28~') # Approximately August 28th 1979 - UncertainOrApproximate: '1979-08-28~' +```python +>>> parse_edtf('1979-08-28~') # Approximately August 28th 1979 +UncertainOrApproximate: '1979-08-28~' +``` * Unspecified dates: - >>> parse_edtf('1979-08-XX') # An unknown day in August 1979 - Unspecified: '1979-08-XX' - >>> parse_edtf('1979-XX') # Some month in 1979 - Unspecified: '1979-XX' +```python +>>> parse_edtf('1979-08-XX') # An unknown day in August 1979 +Unspecified: '1979-08-XX' +>>> parse_edtf('1979-XX') # Some month in 1979 +Unspecified: '1979-XX' +``` * Extended intervals: - >>> parse_edtf('1984-06-02?/2004-08-08~') - Level1Interval: '1984-06-02?/2004-08-08~' +```python +>>> parse_edtf('1984-06-02?/2004-08-08~') +Level1Interval: '1984-06-02?/2004-08-08~' +``` * Years exceeding four digits: - >>> parse_edtf('y-12000') # 12000 years BCE - LongYear: 'y-12000' +```python +>>> parse_edtf('Y-12000') # 12000 years BCE +LongYear: 'Y-12000' +``` * Season: - >>> parse_edtf('1979-22') # Summer 1979 - Season: '1979-22' +```python +>>> parse_edtf('1979-22') # Summer 1979 +Season: '1979-22' +``` ### Level 2 Extensions * Partial uncertain/approximate: - >>> parse_edtf('(2011)-06-04~') # year certain, month/day approximate. 
- # Note that the result text is normalized - PartialUncertainOrApproximate: '2011-(06-04)~' +```python +>>> parse_edtf('2004-06~-11') # year certain, month/day approximate. +PartialUncertainOrApproximate: '2004-06~-11' +``` * Partial unspecified: - >>> parse_edtf('1979-XX-28') # The 28th day of an uncertain month in 1979 - PartialUnspecified: '1979-XX-28' +```python +>>> parse_edtf('1979-XX-28') # The 28th day of an uncertain month in 1979 +PartialUnspecified: '1979-XX-28' +``` * One of a set: - >>> parse_edtf("[..1760-12-03,1762]") - OneOfASet: '[..1760-12-03, 1762]' +```python +>>> parse_edtf("[..1760-12-03,1762]") +OneOfASet: '[..1760-12-03, 1762]' +``` * Multiple dates: - >>> parse_edtf('{1667,1668, 1670..1672}') - MultipleDates: '{1667, 1668, 1670..1672}' - -* Masked precision: - - >>> parse_edtf('197x') # A date in the 1970s. - MaskedPrecision: '197x' +```python +>>> parse_edtf('{1667,1668, 1670..1672}') +MultipleDates: '{1667, 1668, 1670..1672}' +``` * Level 2 Extended intervals: - >>> parse_edtf('2004-06-(01)~/2004-06-(20)~') - Level2Interval: '2004-06-(01)~/2004-06-(20)~' +```python +>>> parse_edtf('2004-06-~01/2004-06-~20') +Level2Interval: '2004-06-~01/2004-06-~20' +``` * Year requiring more than 4 digits - exponential form: - >>> parse_edtf('y-17e7') - ExponentialYear: 'y-17e7' +```python +>>> e = parse_edtf('Y-17E7') +ExponentialYear: 'Y-17E7' +>>> e.estimated() +-170000000 +``` + +* Significant digits: + +```python +# '1950S2': some year between 1900 and 1999, estimated to be 1950 +>>> d = parse_edtf('1950S2') +Date: '1950S2' +>>> d.lower_fuzzy()[:3] +(1900, 1, 1) +>>> d.upper_fuzzy()[:3] +(1999, 12, 31) +# 'Y171010000S3': some year between 171000000 and 171999999 estimated to be 171010000, with 3 significant digits. 
+>>> l = parse_edtf('Y171010000S3') +LongYear: 'Y171010000S3' +>>> l.estimated() +171010000 +>>> l.lower_fuzzy()[:3] +(171000000, 1, 1) +>>> l.upper_fuzzy()[:3] +(171999999, 12, 31) +# 'Y3388E2S3': some year in exponential notation between 338000 and 338999, estimated to be 338800 +>>> e = parse_edtf('Y3388E2S3') +ExponentialYear: 'Y3388E2S3' +>>> e.estimated() +338800 +>>> e.lower_fuzzy()[:3] +(338000, 1, 1) +>>> e.upper_fuzzy()[:3] +(338999, 12, 31) +``` ### Natural language representation - The library includes a basic English natural language parser (it's not yet smart enough to work with occasions such as 'Easter', or in other languages): - >>> from edtf import text_to_edtf - >>> text_to_edtf("circa August 1979") - '1979-08~' +```python +>>> from edtf import text_to_edtf +>>> text_to_edtf("circa August 1979") +'1979-08~' +``` -Note that the result is a string, not an `ETDFObject`. +Note that the result is a string, not an `EDTFObject`. The parser can parse strings such as: - 'January 12, 1940' => '1940-01-12' - '90' => '1990' #implied century - 'January 2008' => '2008-01' - 'the year 1800' => '1800' - '10/7/2008' => '2008-10-07' # in a full-specced date, assume US ordering - - # uncertain/approximate - '1860?' => '1860?' - '1862 (uncertain)' => '1862?' - 'circa Feb 1812' => '1812-02~' - 'c.1860' => '1860~' #with or without . - 'ca1860' => '1860~' - 'approx 1860' => '1860~' - - # masked precision - '1860s' => '186x' #186x has decade precision, 186u has year precision. - '1800s' => '18xx' # without uncertainty indicators, assume century - - # masked precision + uncertainty - 'ca. 1860s' => '186x~' - 'circa 1840s' => '184x~' - 'ca. 1860s?' => '186x?~' - 'c1800s?' 
=> '180x?~' # with uncertainty indicators, use the decade - - # unspecified parts - 'January 12' => 'XXXX-01-12' - 'January' => 'XXXX-01' - '7/2008' => '2008-07' - - #seasons - 'Autumn 1872' => '1872-23' - 'Fall 1872' => '1872-23' - - # before/after - 'earlier than 1928' => 'unknown/1928' - 'later than 1928' => '1928/unknown' - 'before January 1928' => 'unknown/1928-01' - 'after about the 1920s' => '192x~/unknown' - - # unspecified - 'year in the 1860s' => '186u' #186x has decade precision, 186u has year precision. - ('year in the 1800s', '18xu') - 'month in 1872' => '1872-XX' - 'day in January 1872' => '1872-01-XX' - 'day in 1872' => '1872-XX-XX' - - #centuries - '1st century' => '00xx' - '10c' => '09xx' - '19th century?' => '18xx?' - - # just showing off now... - 'a day in about Spring 1849?' => '1849-21-XX?~' - - # simple ranges, which aren't as accurate as they could be. The parser is - limited to only picking the first year range it finds. - '1851-1852' => '1851/1852' - '1851-1852; printed 1853-1854' => '1851/1852' - '1851-52' => '1851/1852' - '1856-ca. 1865' => '1856/1865~' - '1860s-1870s' => '186x/187x' - '1920s -early 1930s' => '192x/193x' - '1938, printed 1940s-1950s' => '1938' - +```text +'January 12, 1940' => '1940-01-12' +'90' => '1990' #implied century +'January 2008' => '2008-01' +'the year 1800' => '1800' +'10/7/2008' => '2008-10-07' # in a full-specced date, assume US ordering + +# uncertain/approximate +'1860?' => '1860?' +'1862 (uncertain)' => '1862?' +'circa Feb 1812' => '1812-02~' +'c.1860' => '1860~' #with or without . +'ca1860' => '1860~' +'approx 1860' => '1860~' +'ca. 1860s' => '186X~' +'circa 1840s' => '184X~' +'ca. 1860s?' => '186X?~' +'c1800s?' 
=> '180X?~' # with uncertainty indicators, use the decade + +# unspecified parts +'January 12' => 'XXXX-01-12' +'January' => 'XXXX-01' +'7/2008' => '2008-07' +'month in 1872' => '1872-XX' +'day in January 1872' => '1872-01-XX' +'day in 1872' => '1872-XX-XX' + +#seasons +'Autumn 1872' => '1872-23' +'Fall 1872' => '1872-23' + +# before/after +'earlier than 1928' => '/1928' +'later than 1928' => '1928/' +'before January 1928' => '/1928-01' +'after about the 1920s' => '192X~/' + +#centuries +'1st century' => '00XX' +'10c' => '09XX' +'19th century?' => '18XX?' + +# just showing off now... +'a day in about Spring 1849?' => '1849-21-XX?~' + +# simple ranges, which aren't as accurate as they could be. The parser is +limited to only picking the first year range it finds. +'1851-1852' => '1851/1852' +'1851-1852; printed 1853-1854' => '1851/1852' +'1851-52' => '1851/1852' +'1856-ca. 1865' => '1856/1865~' +'1860s-1870s' => '186X/187X' +'1920s - early 1930s' => '192X/193X' +'1938, printed 1940s-1950s' => '1938' +``` Generating natural text from an EDTF representation is a future goal. @@ -260,22 +324,20 @@ Generating natural text from an EDTF representation is a future goal. * If a natural language groups dates with a '/', it's interpreted as "or" rather than "and". The resulting EDTF text is a list bracketed by `[]` ("one of these dates") rather than `{}` (all of these dates). - ## Converting to and from Python dates - Since EDTF dates are often regions, and often imprecise, we need to use a few different Python dates, depending on the circumstance. Generally, Python dates are used for sorting and filtering, and are not displayed directly to users. - ### `struct_time` date representation -Because Python's `datetime` module does not support dates out side the range 1 AD to 9999 AD we return dates as `time.struct_time` objects by default instead of the `datetime.date` or `datetime.datetime` objects you might expect. 
+Because Python's `datetime` module does not support dates outside the range 1 AD to 9999 AD we return dates as `time.struct_time` objects by default instead of the `datetime.date` or `datetime.datetime` objects you might expect. The `struct_time` representation is more difficult to work with, but can be sorted as-is which is the primary use-case, and can be converted relatively easily to `date` or `datetime` objects (provided the year is within 1 to 9999 AD) or to date objects in more flexible libraries like [astropy.time](http://docs.astropy.org/en/stable/time/index.html) for years outside these bounds. If you are sure you are working with dates within the range supported by Python's `datetime` module, you can get these more convenient objects using the `edtf.struct_time_to_date` and `edtf.struct_time_to_datetime` functions. -NOTE: This library previously did return `date` and `datetime` objects from methods by default before we switched to `struct_time`. See ticket https://github.com/ixc/python-edtf/issues/26. +> [!NOTE] +> This library previously did return `date` and `datetime` objects from methods by default before we switched to `struct_time`. See ticket <https://github.com/ixc/python-edtf/issues/26>. ### `lower_strict` and `upper_strict` @@ -283,56 +345,109 @@ These dates indicate the earliest and latest dates that are __strictly__ in the In an ascending sort (most recent last), sort by `lower_strict` to get a natural sort order. 
In a descending sort (most recent first), sort by `upper_strict`: - >>> e = parse_edtf('1912-04~') +```python +>>> e = parse_edtf('1912-04~') - >>> e.lower_strict() # Returns struct_time - >>> time.struct_time(tm_year=1912, tm_mon=4, tm_mday=1, tm_hour=0, tm_min=0, tm_sec=0, tm_wday=0, tm_yday=0, tm_isdst=-1) +>>> e.lower_strict() # Returns struct_time +>>> time.struct_time(tm_year=1912, tm_mon=4, tm_mday=1, tm_hour=0, tm_min=0, tm_sec=0, tm_wday=0, tm_yday=0, tm_isdst=-1) - >>> e.lower_strict()[:3] # Show only interesting parts of struct_time - (1912, 4, 01) +>>> e.lower_strict()[:3] # Show only interesting parts of struct_time +(1912, 4, 01) - >>> from edtf import struct_time_to_date - >>> struct_time_to_date(e.lower_strict()) # Convert to date - datetime.date(1912, 4, 01) +>>> from edtf import struct_time_to_date +>>> struct_time_to_date(e.lower_strict()) # Convert to date +datetime.date(1912, 4, 01) - >>> e.upper_strict()[:3] - (1912, 4, 30) +>>> e.upper_strict()[:3] +(1912, 4, 30) - >>> struct_time_to_date(e.upper_strict()) - datetime.date(1912, 4, 30) +>>> struct_time_to_date(e.upper_strict()) +datetime.date(1912, 4, 30) +``` ### `lower_fuzzy` and `upper_fuzzy` ------------------------------------ These dates indicate the earliest and latest dates that are __possible__ in the date range, for a fairly arbitrary definition of 'possibly'. These values are useful for filtering results - i.e. testing which EDTF dates might conceivably fall into, or overlap, a desired date range. -The fuzzy dates are derived from the strict dates, plus or minus a level of padding that depends on how precise the date specfication is. For the case of approximate or uncertain dates, we (arbitrarily) pad the ostensible range by 100% of the uncertain timescale, or by a 12 weeks in the case of seasons. That is, if a date is approximate at the month scale, it is padded by a month. 
If it is approximate at the year scale, it is padded by a year: +The fuzzy dates are derived from the strict dates, plus or minus a level of padding that depends on how precise the date specification is. For the case of approximate or uncertain dates, we (arbitrarily) pad the ostensible range by 100% of the uncertain timescale, or by 12 weeks in the case of seasons. That is, if a date is approximate at the month scale, it is padded by a month. If it is approximate at the year scale, it is padded by a year: - >>> e = parse_edtf('1912-04~') - >>> e.lower_fuzzy()[:3] # padding is 100% of a month - (1912, 3, 1) - >>> e.upper_fuzzy()[:3] - (1912, 5, 30) - >>> e = parse_edtf('1912~') - >>> e.lower_fuzzy()[:3] # padding is 100% of a year - (1911, 1, 1) - >>> e.upper_fuzzy()[:3] - (1913, 12, 31) +```python +>>> e = parse_edtf('1912-04~') +>>> e.lower_fuzzy()[:3] # padding is 100% of a month +(1912, 3, 1) +>>> e.upper_fuzzy()[:3] +(1912, 5, 30) - >>> e = parse_edtf('1912~') - >>> e.lower_fuzzy()[:3] # padding is 100% of a year - (1911, 1, 1) - >>> e.upper_fuzzy()[:3] - (1913, 12, 31) +>>> e = parse_edtf('1912~') +>>> e.lower_fuzzy()[:3] # padding is 100% of a year +(1911, 1, 1) +>>> e.upper_fuzzy()[:3] +(1913, 12, 31) +``` One can interpret uncertain or approximate dates as 'plus or minus a [level of precision]'. If a date is both uncertain __and__ approximate, the padding is applied twice, i.e. it gets 100% * 2 padding, or 'plus or minus two [levels of precision]'. +### Qualification properties + +EDTF objects support properties that provide an overview of how the object is qualified: + +* `.is_uncertain (?)` +* `.is_approximate (~)` +* `.is_uncertain_and_approximate (%)` + +These properties represent whether any part of the date object is uncertain, approximate, or uncertain and approximate. For ranges, the properties are true if any part of the range (lower or upper section) is qualified as such. A date is not necessarily uncertain and approximate if it is separately both uncertain and approximate - it must have the "%" qualifier to be considered uncertain and approximate. 
+ +```python +>>> parse_edtf("2006-06-11") +Date: '2006-06-11' +>>> parse_edtf("2006-06-11").is_uncertain +False +>>> parse_edtf("2006-06-11").is_approximate +False + +>>> parse_edtf("1984?") +UncertainOrApproximate: '1984?' +>>> parse_edtf("1984?").is_approximate +False +>>> parse_edtf("1984?").is_uncertain +True +>>> parse_edtf("1984?").is_uncertain_and_approximate +False + +>>> parse_edtf("1984%").is_uncertain +False +>>> parse_edtf("1984%").is_uncertain_and_approximate +True + +>>> parse_edtf("1984~/2004-06") +Level1Interval: '1984~/2004-06' +>>> parse_edtf("1984~/2004-06").is_approximate +True +>>> parse_edtf("1984~/2004-06").is_uncertain +False + +>>> parse_edtf("2004?-~06-~04") +PartialUncertainOrApproximate: '2004?-~06-~04' +>>> parse_edtf("2004?-~06-~04").is_approximate +True +>>> parse_edtf("2004?-~06-~04").is_uncertain +True +>>> parse_edtf("2004?-~06-~04").is_uncertain_and_approximate +False +``` + ### Seasons -Seasons are interpreted as Northern Hemisphere by default. To change this, override the month mapping in `appsettings.py`. +> [!IMPORTANT] +> Seasons are interpreted as Northern Hemisphere by default. To change this, override the month mapping in [`appsettings.py`](edtf/appsettings.py). ### Comparisons -Two EDTF dates are considered equal if their unicode() representations are the same. An EDTF date is considered greater than another if its `lower_strict` value is later. +Two EDTF dates are considered equal if their `str()` representations are the same. An EDTF date is considered greater than another if its `lower_strict` value is later. ## Django ORM field @@ -342,53 +457,64 @@ To store a natural language value on your model, define another field, and set t When your model is saved, the `natural_text_field` value will be parsed to set the `date_edtf` value, and the underlying EDTF object will set the `_earliest` and `_latest` fields on the model to a float value representing the Julian Date. 
- -**WARNING**: The conversion to and from Julian Date numerical values can be inaccurate, especially for ancient dates back to thousands of years BC. Ideally Julian Date values should be used for range and ordering operations only where complete accuracy is not required. They should **not** be used for definitive storage or for display after roundtrip conversions. +> [!WARNING] +> The conversion to and from Julian Date numerical values can be inaccurate, especially for ancient dates back to thousands of years BC. Ideally Julian Date values should be used for range and ordering operations only where complete accuracy is not required. They should __not__ be used for definitive storage or for display after roundtrip conversions. Example usage: - from django.db import models - from edtf.fields import EDTFField - - class MyModel(models.Model): - date_display = models.CharField( - "Date of creation (display)", - blank=True, - max_length=255, - ) - date_edtf = EDTFField( - "Date of creation (EDTF)", - natural_text_field='date_display', - lower_fuzzy_field='date_earliest', - upper_fuzzy_field='date_latest', - lower_strict_field='date_sort_ascending', - upper_strict_field='date_sort_descending', - blank=True, - null=True, - ) - # use for filtering - date_earliest = models.FloatField(blank=True, null=True) - date_latest = models.FloatField(blank=True, null=True) - # use for sorting - date_sort_ascending = models.FloatField(blank=True, null=True) - date_sort_descending = models.FloatField(blank=True, null=True) - +```python +from django.db import models +from edtf.fields import EDTFField + +class MyModel(models.Model): + date_display = models.CharField( + "Date of creation (display)", + blank=True, + max_length=255, + ) + date_edtf = EDTFField( + "Date of creation (EDTF)", + natural_text_field='date_display', + lower_fuzzy_field='date_earliest', + upper_fuzzy_field='date_latest', + lower_strict_field='date_sort_ascending', + upper_strict_field='date_sort_descending', + 
blank=True, + null=True, + ) + # use for filtering + date_earliest = models.FloatField(blank=True, null=True) + date_latest = models.FloatField(blank=True, null=True) + # use for sorting + date_sort_ascending = models.FloatField(blank=True, null=True) + date_sort_descending = models.FloatField(blank=True, null=True) +``` Since the `EDTFField` and the `_earliest` and `_latest` field values are set automatically, you may want to make them readonly, or not visible in your model admin. ## To develop + ### Setup -- Clone the repository: `git clone https://github.com/ixc/python-edtf.git` -- Set up a virtual environment: `python3 -m venv venv` -- Install the dependencies: `pip install -r dev-requirements.txt` -- Install precommit hooks: `pre-commit install` + +* Clone the repository: `git clone https://github.com/ixc/python-edtf.git` +* Set up a virtual environment: `python3 -m venv venv` +* Install the dependencies: `pip install -r dev-requirements.txt` +* Install precommit hooks: `pre-commit install` ### Running tests -- From `python-edtf`, run the unit tests: `pytest` -- From `python-edtf/edtf_django_tests`, run the integration tests: `python manage.py test edtf_integration` + +* From `python-edtf`, run the unit tests: `pytest` +* From `python-edtf`, run `pytest -m benchmark` to run the benchmarks (published [here]( https://ixc.github.io/python-edtf/dev/bench/)) +* From `python-edtf/edtf_django_tests`, run the integration tests: `python manage.py test edtf_integration` +* To run CI locally, use `act`, e.g. `act pull_request` or `act --pull=false --container-architecture linux/amd64`. 
Some steps may require a GitHub PAT: `act pull_request --container-architecture linux/amd64 --pull=false -s GITHUB_TOKEN=` ### Linting and formatting -- Check linting: `ruff check --output-format=github --config pyproject.toml` -- Check formatting: `ruff format --check --config pyproject.toml` -- Fix formatting: `ruff format --config pyproject.toml` -- Linting and formatting checks and attempted fixes are also run as precommit hooks if you installed them. + +* Check linting: `ruff check --output-format=github --config pyproject.toml` +* Check formatting: `ruff format --check --config pyproject.toml` +* Fix formatting: `ruff format --config pyproject.toml` +* Linting and formatting checks and attempted fixes are also run as pre-commit hooks if you installed them. + +### Coverage and benchmarks + +Coverage reports are generated and added as comments to commits, and are also visible in the actions log. Benchmarks are run on pull requests; the results are published [here](https://ixc.github.io/python-edtf/dev/bench/) and are also visible in the actions log. diff --git a/combine_junit.py b/combine_junit.py new file mode 100644 index 0000000..5e3a05b --- /dev/null +++ b/combine_junit.py @@ -0,0 +1,23 @@ +import sys + +from junitparser import JUnitXml + + +def combine_junit_xml(output_file, *input_files): + combined_xml = JUnitXml() + for input_file in input_files: + xml = JUnitXml.fromfile(input_file) + combined_xml.extend(xml) + combined_xml.write(output_file) + + +if __name__ == "__main__": + if len(sys.argv) < 3: + print( + "Usage: python combine_junit_xml.py ... 
" + ) + sys.exit(1) + + output_file = sys.argv[1] + input_files = sys.argv[2:] + combine_junit_xml(output_file, *input_files) diff --git a/dev-requirements.txt b/dev-requirements.txt index 1e37df5..c27d485 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,5 +1,8 @@ -r requirements.txt # Include all main requirements django>=4.2,<5.0 pytest +pytest-benchmark +pytest-cov +pytest-django ruff pre-commit diff --git a/edtf/__init__.py b/edtf/__init__.py index 7bb2885..0b0bfbf 100644 --- a/edtf/__init__.py +++ b/edtf/__init__.py @@ -22,6 +22,7 @@ UncertainOrApproximate, Unspecified, UnspecifiedIntervalSection, + is_valid_edtf, parse_edtf, ) @@ -46,6 +47,7 @@ "trim_struct_time", "text_to_edtf", "parse_edtf", + "is_valid_edtf", # parser_exceptions "EDTFParseException", # parser_classes diff --git a/edtf/appsettings.py b/edtf/appsettings.py index e1bc821..e9b4d9d 100644 --- a/edtf/appsettings.py +++ b/edtf/appsettings.py @@ -12,7 +12,7 @@ except ImportError: EDTF = {} -SEASON_MONTHS_RANGE = EDTF.get( +SEASON_MONTHS_RANGE: dict[int, list[int]] = EDTF.get( "SEASON_MONTHS_RANGE", { # season id: [earliest_month, last_month] @@ -27,7 +27,7 @@ }, ) -SEASON_L2_MONTHS_RANGE = EDTF.get( +SEASON_L2_MONTHS_RANGE: dict[int, list[int]] = EDTF.get( "SEASON_L2_MONTHS_RANGE", { # season id: [earliest_month, last_month] @@ -67,9 +67,9 @@ }, ) -DAY_FIRST = EDTF.get("DAY_FIRST", False) # Americans! +DAY_FIRST: bool = EDTF.get("DAY_FIRST", False) # Americans! 
-SEASONS = EDTF.get( +SEASONS: dict[int, str] = EDTF.get( "SEASONS", { 21: "spring", @@ -78,16 +78,38 @@ 24: "winter", }, ) -INVERSE_SEASONS = EDTF.get("INVERSE_SEASONS", {v: k for k, v in SEASONS.items()}) +INVERSE_SEASONS: dict[str, int] = EDTF.get( + "INVERSE_SEASONS", {v: k for k, v in SEASONS.items()} +) # also need to interpret `fall` INVERSE_SEASONS["fall"] = 23 # changing these will break tests -PADDING_DAY_PRECISION = EDTF.get("PADDING_DAY_PRECISION", relativedelta(days=1)) -PADDING_MONTH_PRECISION = EDTF.get("PADDING_MONTH_PRECISION", relativedelta(months=1)) -PADDING_YEAR_PRECISION = EDTF.get("PADDING_YEAR_PRECISION", relativedelta(years=1)) -PADDING_SEASON_PRECISION = EDTF.get("PADDING_SEASON_PRECISION", relativedelta(weeks=12)) -MULTIPLIER_IF_UNCERTAIN = EDTF.get("MULTIPLIER_IF_UNCERTAIN", 1.0) -MULTIPLIER_IF_APPROXIMATE = EDTF.get("MULTIPLIER_IF_APPROXIMATE", 1.0) -MULTIPLIER_IF_BOTH = EDTF.get("MULTIPLIER_IF_BOTH", 2.0) -DELTA_IF_UNKNOWN = EDTF.get("DELTA_IF_UNKNOWN", relativedelta(years=10)) +PADDING_DAY_PRECISION: relativedelta = EDTF.get( + "PADDING_DAY_PRECISION", relativedelta(days=1) +) +PADDING_MONTH_PRECISION: relativedelta = EDTF.get( + "PADDING_MONTH_PRECISION", relativedelta(months=1) +) +PADDING_YEAR_PRECISION: relativedelta = EDTF.get( + "PADDING_YEAR_PRECISION", relativedelta(years=1) +) +PADDING_SEASON_PRECISION: relativedelta = EDTF.get( + "PADDING_SEASON_PRECISION", relativedelta(weeks=12) +) +PADDING_DECADE_PRECISION: relativedelta = EDTF.get( + "PADDING_DECADE_PRECISION", relativedelta(years=10) +) +PADDING_CENTURY_PRECISION: relativedelta = EDTF.get( + "PADDING_CENTURY_PRECISION", relativedelta(years=100) +) +PADDING_MILLENNIUM_PRECISION: relativedelta = EDTF.get( + "PADDING_MILLENNIUM_PRECISION", relativedelta(years=1000) +) +MULTIPLIER_IF_UNCERTAIN: float = EDTF.get("MULTIPLIER_IF_UNCERTAIN", 1.0) +MULTIPLIER_IF_APPROXIMATE: float = EDTF.get("MULTIPLIER_IF_APPROXIMATE", 1.0) +MULTIPLIER_IF_BOTH: float = 
EDTF.get("MULTIPLIER_IF_BOTH", 2.0) +DELTA_IF_UNKNOWN: relativedelta = EDTF.get("DELTA_IF_UNKNOWN", relativedelta(years=10)) +DELTA_IF_EMPTY: relativedelta = relativedelta(None) + +DEBUG_PYPARSING: bool = False diff --git a/edtf/convert.py b/edtf/convert.py index a294462..c03e2ea 100644 --- a/edtf/convert.py +++ b/edtf/convert.py @@ -21,7 +21,7 @@ def old_specs_to_new_specs_expression(expression): return expression -def dt_to_struct_time(dt): +def dt_to_struct_time(dt) -> struct_time: """ Convert a `datetime.date` or `datetime.datetime` to a `struct_time` representation *with zero values* for data fields that we cannot always @@ -39,11 +39,10 @@ def dt_to_struct_time(dt): return struct_time( [dt.year, dt.month, dt.day] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS ) - else: - raise NotImplementedError(f"Cannot convert {type(dt)} to `struct_time`") + raise NotImplementedError(f"Cannot convert {type(dt)} to `struct_time`") -def struct_time_to_date(st): +def struct_time_to_date(st: struct_time) -> date: """ Return a `datetime.date` representing the provided `struct_time. @@ -52,7 +51,7 @@ def struct_time_to_date(st): return date(*st[:3]) -def struct_time_to_datetime(st): +def struct_time_to_datetime(st: struct_time) -> datetime: """ Return a `datetime.datetime` representing the provided `struct_time. @@ -61,7 +60,7 @@ def struct_time_to_datetime(st): return datetime(*st[:6]) -def trim_struct_time(st, strip_time=False): +def trim_struct_time(st: struct_time, strip_time: bool = False) -> struct_time: """ Return a `struct_time` based on the one provided but with the extra fields `tm_wday`, `tm_yday`, and `tm_isdst` reset to default values. 
@@ -71,11 +70,10 @@ def trim_struct_time(st, strip_time=False): """ if strip_time: return struct_time(list(st[:3]) + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS) - else: - return struct_time(list(st[:6]) + TIME_EMPTY_EXTRAS) + return struct_time(list(st[:6]) + TIME_EMPTY_EXTRAS) -def struct_time_to_jd(st): +def struct_time_to_jd(st: struct_time) -> float: """ Return a float number representing the Julian Date for the given `struct_time`. @@ -91,7 +89,7 @@ def struct_time_to_jd(st): return jdutil.date_to_jd(year, month, day) -def jd_to_struct_time(jd): +def jd_to_struct_time(jd: float) -> struct_time: """ Return a `struct_time` converted from a Julian Date float number. @@ -117,7 +115,7 @@ def jd_to_struct_time(jd): return struct_time([year, month, day, hour, minute, second] + TIME_EMPTY_EXTRAS) -def _roll_negative_time_fields(year, month, day, hour, minute, second): +def _roll_negative_time_fields(year, month, day, hour, minute, second) -> tuple: """ Fix date/time fields which have nonsense negative values for any field except for year by rolling the overall date/time value backwards, treating @@ -153,4 +151,5 @@ def _roll_negative_time_fields(year, month, day, hour, minute, second): year += int(month / 12.0) # Adjust by whole year in months year -= 1 # Subtract 1 for negative minutes month %= 12 # Convert negative month to positive remainder - return (year, month, day, hour, minute, second) + + return year, month, day, hour, minute, second diff --git a/edtf/fields.py b/edtf/fields.py index f717592..07a9744 100644 --- a/edtf/fields.py +++ b/edtf/fields.py @@ -1,13 +1,16 @@ import pickle +from django.core import checks from django.core.exceptions import FieldDoesNotExist from django.db import models from django.db.models import signals from django.db.models.query_utils import DeferredAttribute +from pyparsing import ParseException from edtf import EDTFObject, parse_edtf from edtf.convert import struct_time_to_date, struct_time_to_jd from edtf.natlang import text_to_edtf 
+from edtf.parser.edtf_exceptions import EDTFParseException DATE_ATTRS = ( "lower_strict", @@ -46,21 +49,12 @@ def __init__( **kwargs, ): kwargs["max_length"] = 2000 - ( - self.natural_text_field, - self.direct_input_field, - self.lower_strict_field, - self.upper_strict_field, - self.lower_fuzzy_field, - self.upper_fuzzy_field, - ) = ( - natural_text_field, - direct_input_field, - lower_strict_field, - upper_strict_field, - lower_fuzzy_field, - upper_fuzzy_field, - ) + self.natural_text_field = natural_text_field + self.direct_input_field = direct_input_field + self.lower_strict_field = lower_strict_field + self.upper_strict_field = upper_strict_field + self.lower_fuzzy_field = lower_fuzzy_field + self.upper_fuzzy_field = upper_fuzzy_field super().__init__(verbose_name, name, **kwargs) description = ( @@ -72,6 +66,8 @@ def deconstruct(self): name, path, args, kwargs = super().deconstruct() if self.natural_text_field: kwargs["natural_text_field"] = self.natural_text_field + if self.direct_input_field: + kwargs["direct_input_field"] = self.direct_input_field for attr in DATE_ATTRS: field = f"{attr}_field" @@ -132,13 +128,15 @@ def update_values(self, instance, *args, **kwargs): if direct_input and ( existing_value is None or str(existing_value) != direct_input ): - edtf = parse_edtf( - direct_input, fail_silently=True - ) # ParseException if invalid; should this be raised? - # TODO pyparsing.ParseExceptions are very noisy and dumps the whole grammar (see https://github.com/ixc/python-edtf/issues/46) + try: + edtf = parse_edtf( + direct_input, fail_silently=True + ) # ParseException if invalid; should this be raised? 
+ except ParseException as err: + raise EDTFParseException(direct_input, err) from None # set the natural_text (display) field to the direct_input if it is not provided - if natural_text == "": + if not natural_text: setattr(instance, self.natural_text_field, direct_input) elif natural_text: @@ -148,7 +146,7 @@ def update_values(self, instance, *args, **kwargs): ): edtf = parse_edtf( edtf_string, fail_silently=True - ) # potetial ParseException if invalid; should this be raised? + ) # potential ParseException if invalid; should this be raised? else: edtf = existing_value else: @@ -191,3 +189,46 @@ def contribute_to_class(self, cls, name, **kwargs): # Only run post-initialization values update on non-abstract models if not cls._meta.abstract: signals.post_init.connect(self.update_values, sender=cls) + + def check(self, **kwargs): + errors = super().check(**kwargs) + + for field_alias in [ + "direct_input_field", + "lower_fuzzy_field", + "lower_strict_field", + "natural_text_field", + "upper_fuzzy_field", + "upper_strict_field", + ]: + errors.extend(self._check_field(field_alias)) + + return errors + + def _check_field(self, field_alias): + field_name = getattr(self, field_alias, None) + + # Check if the alias value has been provided in the field definition + if not field_name: + return [ + checks.Error( + f"You must specify a '{field_alias}' for EDTFField", + hint=None, + obj=self, + id="python-edtf.EDTF01", + ) + ] + + # Check if the field that is referenced actually exists + try: + self.model._meta.get_field(field_name) + except FieldDoesNotExist: + return [ + checks.Error( + f"'{self.name}' refers to a non-existent '{field_alias}' field: '{field_name}'", + hint=None, + obj=self, + id="python-edtf.EDTF02", + ) + ] + return [] diff --git a/edtf/jdutil.py b/edtf/jdutil.py index 16cd312..b7a2cbb 100644 --- a/edtf/jdutil.py +++ b/edtf/jdutil.py @@ -18,7 +18,7 @@ # time deltas if one date is from before 10-15-1582. 
-def mjd_to_jd(mjd): +def mjd_to_jd(mjd: float) -> float: """ Convert Modified Julian Day to Julian Day. @@ -37,7 +37,7 @@ def mjd_to_jd(mjd): return mjd + 2400000.5 -def jd_to_mjd(jd): +def jd_to_mjd(jd: float) -> float: """ Convert Julian Day to Modified Julian Day @@ -55,7 +55,7 @@ def jd_to_mjd(jd): return jd - 2400000.5 -def date_to_jd(year, month, day): +def date_to_jd(year: int, month: int, day: float) -> float: """ Convert a date to Julian Day. @@ -117,7 +117,7 @@ def date_to_jd(year, month, day): return jd -def jd_to_date(jd): +def jd_to_date(jd: float) -> tuple: """ Convert Julian Day to date. @@ -175,7 +175,7 @@ def jd_to_date(jd): return year, month, day -def hmsm_to_days(hour=0, min=0, sec=0, micro=0): +def hmsm_to_days(hour: int = 0, min: int = 0, sec: int = 0, micro: int = 0) -> float: """ Convert hours, minutes, seconds, and microseconds to fractional days. @@ -262,7 +262,7 @@ def days_to_hmsm(days): return int(hour), int(min), int(sec), int(micro) -def datetime_to_jd(date): +def datetime_to_jd(date: dt.datetime) -> float: """ Convert a `datetime.datetime` object to Julian Day. @@ -291,7 +291,7 @@ def datetime_to_jd(date): return date_to_jd(date.year, date.month, days) -def jd_to_datetime(jd): +def jd_to_datetime(jd: float) -> dt.datetime: """ Convert a Julian Day to an `jdutil.datetime` object. @@ -321,7 +321,7 @@ def jd_to_datetime(jd): return datetime(year, month, day, hour, min, sec, micro) -def timedelta_to_days(td): +def timedelta_to_days(td: dt.timedelta) -> float: """ Convert a `datetime.timedelta` object to a total number of days. 
@@ -396,7 +396,7 @@ def __sub__(self, other): return jd_to_datetime(combined) - elif isinstance(other, (datetime, dt.datetime)): + elif isinstance(other, datetime | dt.datetime): diff = datetime_to_jd(self) - datetime_to_jd(other) return dt.timedelta(diff) @@ -407,7 +407,7 @@ def __sub__(self, other): raise TypeError(s) def __rsub__(self, other): - if not isinstance(other, (datetime, dt.datetime)): + if not isinstance(other, datetime | dt.datetime): s = "jdutil.datetime supports '-' with: " s += "jdutil.datetime and datetime.datetime" raise TypeError(s) diff --git a/edtf/natlang/en.py b/edtf/natlang/en.py index 5fb2fea..077ae19 100644 --- a/edtf/natlang/en.py +++ b/edtf/natlang/en.py @@ -1,10 +1,10 @@ """Utilities to derive an EDTF string from an (English) natural language string.""" +import functools import re from datetime import datetime from dateutil.parser import ParserError, parse -from six.moves import xrange from edtf import appsettings @@ -14,29 +14,55 @@ DEFAULT_DATE_1 = datetime(1234, 1, 1, 0, 0) DEFAULT_DATE_2 = datetime(5678, 10, 10, 0, 0) -SHORT_YEAR_RE = r"(-?)([\du])([\dxu])([\dxu])([\dxu])" -LONG_YEAR_RE = r"y(-?)([1-9]\d\d\d\d+)" -CENTURY_RE = r"(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?" -CE_RE = r"(\d{1,4}) (ad|ce|bc|bce)" +LONG_YEAR_RE = re.compile(r"y(-?)([1-9]\d\d\d\d+)") +CENTURY_RE = re.compile(r"(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?") +CENTURY_RANGE = re.compile(r"\b(\d\d)(th|st|nd|rd|)-(\d\d)(th|st|nd|rd) [cC]") +CE_RE = re.compile(r"(\d{1,4}) (ad|ce|bc|bce)") +ONE_DIGIT_PARTIAL_FIRST = re.compile(r"\d\D\b") +TWO_DIGIT_PARTIAL_FIRST = re.compile(r"\d\d\b") +PARTIAL_CHECK = re.compile(r"\b\d\d\d\d$") +SLASH_YEAR = re.compile(r"(\d\d\d\d)/(\d\d\d\d)") +BEFORE_CHECK = re.compile(r"\b(?:before|earlier|avant)\b") +AFTER_CHECK = re.compile(r"\b(after|since|later|aprés|apres)\b") +APPROX_CHECK = re.compile( + r"\b(?:ca?\.? 
?\d{4}|circa|approx|approximately|around|about|~\d{3,4})|^~" +) +UNCERTAIN_CHECK = re.compile(r"\b(?:uncertain|possibly|maybe|guess|\d{3,4}\?)") +UNCERTAIN_REPL = re.compile(r"(\d{4})\?") +MIGHT_BE_CENTURY = re.compile(r"(\d{2}00)s") +MIGHT_BE_DECADE = re.compile(r"(\d{3}0)s") + +APPROX_CENTURY_RE = re.compile( + r"\b(ca?\.?) ?(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?" +) +UNCERTAIN_CENTURY_RE = re.compile( + r"(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?\?" +) + +APPROX_CE_RE = re.compile(r"\b(ca?\.?) ?(\d{1,4}) (ad|ce|bc|bce)") +UNCERTAIN_CE_RE = re.compile(r"(\d{1,4}) (ad|ce|bc|bce)\?") + +MENTIONS_YEAR = re.compile(r"\byear\b.+(in|during)\b") +MENTIONS_MONTH = re.compile(r"\bmonth\b.+(in|during)\b") +MENTIONS_DAY = re.compile(r"\bday\b.+(in|during)\b") # Set of RE rules that will cause us to abort text processing, since we know # the results will be wrong. -REJECT_RULES = ( - r".*dynasty.*", # Don't parse '23rd Dynasty' to 'uuuu-uu-23' -) +REJECT_RULES = re.compile(r".*dynasty.*") # Don't parse '23rd Dynasty' to 'uuuu-uu-23' -def text_to_edtf(text): +@functools.lru_cache +def text_to_edtf(text: str) -> str | None: """ Generate EDTF string equivalent of a given natural language date string. """ if not text: - return + return None t = text.lower() # try parsing the whole thing - result = text_to_edtf_date(t) + result: str | None = text_to_edtf_date(t) if not result: # split by list delims and move fwd with the first thing that returns a non-empty string. @@ -44,7 +70,8 @@ def text_to_edtf(text): for split in [",", ";", "or"]: for list_item in t.split(split): # try parsing as an interval - split by '-' - toks = list_item.split("-") + toks: list[str] = list_item.split("-") + if len(toks) == 2: d1 = toks[0].strip() d2 = toks[1].strip() @@ -52,19 +79,20 @@ def text_to_edtf(text): # match looks from the beginning of the string, search # looks anywhere. - if re.match(r"\d\D\b", d2): # 1-digit year partial e.g. 
1868-9 + if re.match( + ONE_DIGIT_PARTIAL_FIRST, d2 + ): # 1-digit year partial e.g. 1868-9 if re.search( - r"\b\d\d\d\d$", d1 + PARTIAL_CHECK, d1 ): # TODO: evaluate it and see if it's a year d2 = d1[-4:-1] + d2 - elif re.match(r"\d\d\b", d2): # 2-digit year partial e.g. 1809-10 - if re.search(r"\b\d\d\d\d$", d1): + elif re.match( + TWO_DIGIT_PARTIAL_FIRST, d2 + ): # 2-digit year partial e.g. 1809-10 + if re.search(PARTIAL_CHECK, d1): d2 = d1[-4:-2] + d2 else: - century_range_match = re.search( - r"\b(\d\d)(th|st|nd|rd|)-(\d\d)(th|st|nd|rd) [cC]", - f"{d1}-{d2}", - ) + century_range_match = re.search(CENTURY_RANGE, f"{d1}-{d2}") if century_range_match: g = century_range_match.groups() d1 = f"{g[0]}C" @@ -74,7 +102,7 @@ def text_to_edtf(text): r2 = text_to_edtf_date(d2) if r1 and r2: - result = r1 + "/" + r2 + result = f"{r1}/{r2}" return result # is it an either/or year "1838/1862" - that has a different @@ -83,7 +111,7 @@ def text_to_edtf(text): # This whole section could be more friendly. else: - int_match = re.search(r"(\d\d\d\d)\/(\d\d\d\d)", list_item) + int_match = re.search(SLASH_YEAR, list_item) if int_match: return f"[{int_match.group(1)}, {int_match.group(2)}]" @@ -93,22 +121,19 @@ def text_to_edtf(text): if result: break - is_before = re.findall(r"\bbefore\b", t) - is_before = is_before or re.findall(r"\bearlier\b", t) - - is_after = re.findall(r"\bafter\b", t) - is_after = is_after or re.findall(r"\bsince\b", t) - is_after = is_after or re.findall(r"\blater\b", t) + is_before = re.findall(BEFORE_CHECK, t) + is_after = re.findall(AFTER_CHECK, t) if is_before: - result = f"unknown/{result}" + result = f"/{result}" elif is_after: - result = f"{result}/unknown" + result = f"{result}/" return result -def text_to_edtf_date(text): +@functools.lru_cache +def text_to_edtf_date(text: str) -> str | None: """ Return EDTF string equivalent of a given natural language date string. @@ -117,37 +142,28 @@ def text_to_edtf_date(text): differ are undefined. 
""" if not text: - return + return None t = text.lower() - result = "" + result: str = "" - for reject_re in REJECT_RULES: - if re.match(reject_re, t): - return + if re.match(REJECT_RULES, t): + return None # matches on '1800s'. Needs to happen before is_decade. - could_be_century = re.findall(r"(\d{2}00)s", t) + could_be_century: list = re.findall(MIGHT_BE_CENTURY, t) # matches on '1800s' and '1910s'. Removes the 's'. # Needs to happen before is_uncertain because e.g. "1860s?" - t, is_decade = re.subn(r"(\d{3}0)s", r"\1", t) + t, is_decade = re.subn(MIGHT_BE_DECADE, r"\1", t) # detect approximation signifiers # a few 'circa' abbreviations just before the year - is_approximate = re.findall(r"\b(ca?\.?) ?\d{4}", t) + is_approximate = re.findall(APPROX_CHECK, t) # the word 'circa' anywhere - is_approximate = is_approximate or re.findall(r"\bcirca\b", t) - # the word 'approx'/'around'/'about' anywhere - is_approximate = is_approximate or re.findall(r"\b(approx|around|about)", t) - # a ~ before a year-ish number - is_approximate = is_approximate or re.findall(r"\b~\d{4}", t) - # a ~ at the beginning - is_approximate = is_approximate or re.findall(r"^~", t) # detect uncertainty signifiers - t, is_uncertain = re.subn(r"(\d{4})\?", r"\1", t) - # the words uncertain/maybe/guess anywhere - is_uncertain = is_uncertain or re.findall(r"\b(uncertain|possibly|maybe|guess)", t) + t, is_uncertain = re.subn(UNCERTAIN_REPL, r"\1", t) + is_uncertain = is_uncertain or re.findall(UNCERTAIN_CHECK, t) # detect century forms is_century = re.findall(CENTURY_RE, t) @@ -155,32 +171,29 @@ def text_to_edtf_date(text): # detect CE/BCE year form is_ce = re.findall(CE_RE, t) if is_century: - result = "%02dxx" % (int(is_century[0][0]) - 1,) - is_approximate = is_approximate or re.findall(r"\b(ca?\.?) ?" 
+ CENTURY_RE, t) - is_uncertain = is_uncertain or re.findall(CENTURY_RE + r"\?", t) + result = f"{int(is_century[0][0]) - 1:02d}XX" + is_approximate = is_approximate or re.findall(APPROX_CENTURY_RE, t) + is_uncertain = is_uncertain or re.findall(UNCERTAIN_CENTURY_RE, t) try: - is_bc = is_century[0][-1] in ("bc", "bce") - if is_bc: + if is_century[0][-1] in ("bc", "bce"): result = f"-{result}" except IndexError: pass elif is_ce: - result = "%04d" % (int(is_ce[0][0])) - is_approximate = is_approximate or re.findall(r"\b(ca?\.?) ?" + CE_RE, t) - is_uncertain = is_uncertain or re.findall(CE_RE + r"\?", t) + result = f"{int(is_ce[0][0]):04d}" + is_approximate = is_approximate or re.findall(APPROX_CE_RE, t) + is_uncertain = is_uncertain or re.findall(UNCERTAIN_CE_RE, t) try: - is_bc = is_ce[0][-1] in ("bc", "bce") - if is_bc: + if is_ce[0][-1] in ("bc", "bce"): result = f"-{result}" except IndexError: pass else: # try dateutil.parse - try: # parse twice, using different defaults to see what was # parsed and what was guessed. @@ -201,46 +214,46 @@ def text_to_edtf_date(text): ) except ParserError: - return + return None except Exception: - return + return None if dt1.date() == DEFAULT_DATE_1.date() and dt2.date() == DEFAULT_DATE_2.date(): # couldn't parse anything - defaults are untouched. - return + return None date1 = dt1.isoformat()[:10] date2 = dt2.isoformat()[:10] # guess precision of 'unspecified' characters to use - mentions_year = re.findall(r"\byear\b.+(in|during)\b", t) - mentions_month = re.findall(r"\bmonth\b.+(in|during)\b", t) - mentions_day = re.findall(r"\bday\b.+(in|during)\b", t) + mentions_year = re.findall(MENTIONS_YEAR, t) + mentions_month = re.findall(MENTIONS_MONTH, t) + mentions_day = re.findall(MENTIONS_DAY, t) - for i in xrange(len(date1)): + for i, char in enumerate(date1): # if the given year could be a century (e.g. '1800s') then use # approximate/uncertain markers to decide whether we treat it as # a century or a decade. 
if i == 2 and could_be_century and not (is_approximate or is_uncertain): - result += "x" - elif i == 3 and is_decade > 0: + result += "X" + elif i == 3 and is_decade: if mentions_year: - result += "u" # year precision + result += "X" # year precision else: - result += "x" # decade precision - elif date1[i] == date2[i]: + result += "X" # decade precision + elif char == date2[i]: # since both attempts at parsing produced the same result # it must be parsed value, not a default - result += date1[i] + result += char else: # different values were produced, meaning that it's likely # a default. Use 'unspecified' - result += "u" + result += "X" # strip off unknown chars from end of string - except the first 4 - for i in reversed(xrange(len(result))): - if result[i] not in ("u", "x", "-"): + for i in reversed(range(len(result))): + if result[i] not in ("X", "-"): smallest_length = 4 if mentions_month: @@ -264,14 +277,16 @@ def text_to_edtf_date(text): # end dateutil post-parsing - if is_uncertain: - result += "?" - - if is_approximate: - result += "~" + if is_uncertain and is_approximate: + result += "%" + else: + if is_uncertain: + result += "?" 
+ if is_approximate: + result += "~" # weed out bad parses - if result.startswith("uu-uu"): + if result.startswith("XX-XX"): return None return result diff --git a/edtf/natlang/tests.py b/edtf/natlang/tests.py index 3602775..e0acaad 100644 --- a/edtf/natlang/tests.py +++ b/edtf/natlang/tests.py @@ -4,16 +4,15 @@ from edtf.natlang.en import text_to_edtf -# TODO update the tests and code to test and output the new spec - +# TODO update the tests and code to test and output the new spec # where examples are tuples, the second item is the normalised output @pytest.mark.parametrize( "input_text,expected_output", [ # Ignoring 'late' for simplicity in these examples - ("active late 17th-19th centuries", "16xx/18xx"), - ("active 17-19th Centuries", "16xx/18xx"), + ("active late 17th-19th centuries", "16XX/18XX"), + ("active 17-19th Centuries", "16XX/18XX"), # Unrecognised values ("", None), ("this isn't a date", None), @@ -56,23 +55,21 @@ "1802", ), # Avoid false positive 'circa' at the end of preceding word ("attic. 1802", "1802"), # Avoid false positive 'circa' - # Masked precision - ("1860s", "186x"), # 186x has decade precision, 186u has year precision. - # Masked precision + uncertainty - ("ca. 1860s", "186x~"), - ("c. 1860s", "186x~"), - ("Circa 1840s", "184x~"), - ("circa 1840s", "184x~"), - ("ca. 1860s?", "186x?~"), - ("uncertain: approx 1862", "1862?~"), - # Ambiguous masked precision for centuries and decades - ("1800s", "18xx"), # Without additional uncertainty, use the century - ("2000s", "20xx"), # Without additional uncertainty, use the century - ("c1900s", "190x~"), # If there's additional uncertainty, use the decade - ("c1800s?", "180x?~"), # If there's additional uncertainty, use the decade + # Previously tested masked precision, uncertain or ambiguous masked precision + ("1860s", "186X"), + ("ca. 1860s", "186X~"), + ("c. 1860s", "186X~"), + ("Circa 1840s", "184X~"), + ("circa 1840s", "184X~"), + ("ca. 
1860s?", "186X%"), + ("uncertain: approx 1862", "1862%"), + ("1800s", "18XX"), + ("2000s", "20XX"), + ("c1900s", "190X~"), + ("c1800s?", "180X%"), # Unspecified dates - ("January 12", "uuuu-01-12"), - ("January", "uuuu-01"), + ("January 12", "XXXX-01-12"), + ("January", "XXXX-01"), ("10/7/2008", "2008-10-07"), ("7/2008", "2008-07"), # Seasons mapped to specific codes @@ -82,45 +79,46 @@ ("Fall 1872", "1872-23"), ("Winter 1872", "1872-24"), # Dates relative to known events (before/after) - ("earlier than 1928", "unknown/1928"), - ("before 1928", "unknown/1928"), - ("after 1928", "1928/unknown"), - ("later than 1928", "1928/unknown"), - ("before January 1928", "unknown/1928-01"), - ("before 18 January 1928", "unknown/1928-01-18"), + ("earlier than 1928", "/1928"), + ("before 1928", "/1928"), + ("after 1928", "1928/"), + ("later than 1928", "1928/"), + ("before January 1928", "/1928-01"), + ("before 18 January 1928", "/1928-01-18"), # Approximations combined with before/after - ("before approx January 18 1928", "unknown/1928-01-18~"), - ("before approx January 1928", "unknown/1928-01~"), - ("after approx January 1928", "1928-01~/unknown"), - ("after approx Summer 1928", "1928-22~/unknown"), + ("before approx January 18 1928", "/1928-01-18~"), + ("before approx January 1928", "/1928-01~"), + ("after approx January 1928", "1928-01~/"), + ("after approx Summer 1928", "1928-22~/"), # Before and after with uncertain / unspecified components - ("after about the 1920s", "192x~/unknown"), - ("before about the 1900s", "unknown/190x~"), - ("before the 1900s", "unknown/19xx"), - # Specifying unspecified components within a date - # ('decade in 1800s', '18ux'), #too esoteric - # ('decade somewhere during the 1800s', '18ux'), #lengthier. 
Keywords are 'in' or 'during' - ("year in the 1860s", "186u"), # 186x has decade precision - ("year in the 1800s", "18xu"), # 186u has year precision - ("year in about the 1800s", "180u~"), - ("month in 1872", "1872-uu"), - ("day in Spring 1849", "1849-21-uu"), - ("day in January 1872", "1872-01-uu"), - ("day in 1872", "1872-uu-uu"), + ("after about the 1920s", "192X~/"), + ("before about the 1900s", "/190X~"), + ("before the 1900s", "/19XX"), + # previous examples for masked precision, now removed from the EDTF spec + # use `X` for unknown regardless of precision or why the data is unknown + ("decade in 1800s", "18XX"), + ("decade somewhere during the 1800s", "18XX"), + ("year in the 1860s", "186X"), + ("year in the 1800s", "18XX"), + ("year in about the 1800s", "180X~"), + ("month in 1872", "1872-XX"), + ("day in Spring 1849", "1849-21-XX"), + ("day in January 1872", "1872-01-XX"), + ("day in 1872", "1872-XX-XX"), ("birthday in 1872", "1872"), # Handling centuries with approximation and uncertainty - ("1st century", "00xx"), - ("10c", "09xx"), - ("19th century", "18xx"), - ("19th century?", "18xx?"), - ("before 19th century", "unknown/18xx"), - ("19c", "18xx"), - ("15c.", "14xx"), - ("ca. 19c", "18xx~"), - ("~19c", "18xx~"), - ("about 19c", "18xx~"), - ("19c?", "18xx?"), - ("c.19c?", "18xx?~"), + ("1st century", "00XX"), + ("10c", "09XX"), + ("19th century", "18XX"), + ("19th century?", "18XX?"), + ("before 19th century", "/18XX"), + ("19c", "18XX"), + ("15c.", "14XX"), + ("ca. 
19c", "18XX~"), + ("~19c", "18XX~"), + ("about 19c", "18XX~"), + ("19c?", "18XX?"), + ("c.19c?", "18XX%"), # BC/AD dating ("1 AD", "0001"), ("17 CE", "0017"), @@ -131,12 +129,12 @@ ("c127 CE", "0127~"), ("c1270 CE", "1270~"), ("c64 BCE", "-0064~"), - ("2nd century bc", "-01xx"), # -200 to -101 - ("2nd century bce", "-01xx"), - ("2nd century ad", "01xx"), - ("2nd century ce", "01xx"), + ("2nd century bc", "-01XX"), # -200 to -101 + ("2nd century bce", "-01XX"), + ("2nd century ad", "01XX"), + ("2nd century ce", "01XX"), # Combining uncertainties and approximations in creative ways - ("a day in about Spring 1849?", "1849-21-uu?~"), + ("a day in about Spring 1849?", "1849-21-XX%"), # Simple date ranges, showcasing both the limitations and capabilities of the parser # Not all of these results are correct EDTF, but this is as good as the EDTF implementation # and simple natural language parser we have. @@ -145,9 +143,9 @@ ("1851-52", "1851/1852"), ("1852 - 1860", "1852/1860"), ("1856-ca. 1865", "1856/1865~"), - ("1857-mid 1860s", "1857/186x"), + ("1857-mid 1860s", "1857/186X"), ("1858/1860", "[1858, 1860]"), - ("1860s-1870s", "186x/187x"), + ("1860s-1870s", "186X/187X"), ("1910-30", "1910/1930"), ("active 1910-30", "1910/1930"), ("1861-67", "1861/1867"), @@ -160,33 +158,22 @@ ("1864-1872, printed 1870s", "1864/1872"), ("1868-1871?", "1868/1871?"), ("1869-70", "1869/1870"), - ("1870s, printed ca. 1880s", "187x"), + ("1870s, printed ca. 
1880s", "187X"), ("1900-1903, cast before 1929", "1900/1903"), ("1900; 1973", "1900"), ("1900; printed 1912", "1900"), ("1915 late - autumn 1916", "1915/1916-23"), ("1915, from Camerawork, October 1916", "1915"), # should be {1915, 1916-10} - ("1920s -early 1930s", "192x/193x"), + ("1920s -early 1930s", "192X/193X"), ( "1930s, printed early 1960s", - "193x", + "193X", ), # should be something like {193x, 196x}, ("1932, printed 1976 by Gunther Sander", "1932"), # should be {1932, 1976} ( "1938, printed 1940s-1950s", "1938", ), # should be something like {1938, 194x-195x} - # Uncertain and approximate on different parts of the date - # for these to work we need to recast is_uncertain and is_approximate - # such that they work on different parts. Probably worth rolling our own - # dateparser at this point. - # ('July in about 1849', '1849~-07'), - # ('a day in July in about 1849', '1849~-07-uu'), - # ('a day in Spring in about 1849', '1849~-21-uu'), - # ('a day in about July? in about 1849', '1849~-07?~-uu'), - # ('a day in about Spring in about 1849', '1849~-21~-uu'), - # ('maybe January in some year in about the 1830s', '183u~-01?'), - # ('about July? in about 1849', '1849~-07?~'), ], ) def test_natlang(input_text, expected_output): @@ -195,4 +182,30 @@ def test_natlang(input_text, expected_output): Verify that the conversion from text to EDTF format matches the expected output. 
""" result = text_to_edtf(input_text) - assert result == expected_output, f"Failed for input: {input_text}" + assert result == expected_output, ( + f"Failed for input: {input_text} - expected {expected_output}, got {result}" + ) + + +@pytest.mark.benchmark +@pytest.mark.parametrize( + "input_text,expected_output", + [ + ("23rd Dynasty", None), + ("January 2008", "2008-01"), + ("ca1860", "1860~"), + ("uncertain: approx 1862", "1862%"), + ("January", "XXXX-01"), + ("Winter 1872", "1872-24"), + ("before approx January 18 1928", "/1928-01-18~"), + ("birthday in 1872", "1872"), + ("1270 CE", "1270"), + ("2nd century bce", "-01XX"), + ("1858/1860", "[1858, 1860]"), + ], +) +def test_benchmark_natlang(benchmark, input_text, expected_output): + """ + Benchmark selected natural language conversions + """ + benchmark(text_to_edtf, input_text) diff --git a/edtf/parser/__init__.py b/edtf/parser/__init__.py index 43197d5..9cbf3c3 100644 --- a/edtf/parser/__init__.py +++ b/edtf/parser/__init__.py @@ -1,5 +1,5 @@ from .edtf_exceptions import EDTFParseException -from .grammar import parse_edtf +from .grammar import is_valid_edtf, parse_edtf from .parser_classes import ( UA, Consecutives, @@ -26,6 +26,7 @@ __all__ = [ "parse_edtf", + "is_valid_edtf", "EDTFParseException", "EDTFObject", "Date", diff --git a/edtf/parser/edtf_exceptions.py b/edtf/parser/edtf_exceptions.py index 9530602..d906d58 100644 --- a/edtf/parser/edtf_exceptions.py +++ b/edtf/parser/edtf_exceptions.py @@ -2,4 +2,28 @@ class EDTFParseException(ParseException): - pass + """Raised when an input cannot be parsed as an EDTF string. 
+ + Attributes: + input_string - the input string that could not be parsed + err -- the original ParseException that caused this one + """ + + def __init__(self, input_string, err=None): + if input_string is None: + input_string = "" + self.input_string = input_string + if err is None: + err = ParseException(input_string, 0, "Invalid input or format.") + self.err = err + super().__init__(str(err), err.loc if err.loc else 0, self.input_string) + + def __str__(self): + if not self.input_string: + return "You must supply some input text" + near_text = ( + self.input_string[max(self.err.loc - 10, 0) : self.err.loc + 10] + if hasattr(self.err, "loc") + else "" + ) + return f"Error at position {self.err.loc}: Invalid input or format near '{near_text}'. Please provide a valid EDTF string." diff --git a/edtf/parser/grammar.py b/edtf/parser/grammar.py index 730f47d..0624a92 100644 --- a/edtf/parser/grammar.py +++ b/edtf/parser/grammar.py @@ -1,3 +1,13 @@ +# ruff: noqa: E402 I001 + +# It's recommended to `enablePackrat()` immediately after importing pyparsing +# https://github.com/pyparsing/pyparsing/wiki/Performance-Tips + +import pyparsing +from edtf.appsettings import DEBUG_PYPARSING +from edtf.util import remapparams + +pyparsing.ParserElement.enable_packrat() from pyparsing import ( Combine, NotAny, @@ -9,10 +19,11 @@ Word, ZeroOrMore, nums, - oneOf, + one_of, ) from pyparsing import Literal as L + from edtf.parser.edtf_exceptions import EDTFParseException # (* ************************** Level 0 *************************** *) @@ -38,18 +49,19 @@ Unspecified, ) -oneThru12 = oneOf(["%.2d" % i for i in range(1, 13)]) -oneThru13 = oneOf(["%.2d" % i for i in range(1, 14)]) -oneThru23 = oneOf(["%.2d" % i for i in range(1, 24)]) -zeroThru23 = oneOf(["%.2d" % i for i in range(0, 24)]) -oneThru29 = oneOf(["%.2d" % i for i in range(1, 30)]) -oneThru30 = oneOf(["%.2d" % i for i in range(1, 31)]) -oneThru31 = oneOf(["%.2d" % i for i in range(1, 32)]) -oneThru59 = oneOf(["%.2d" % i 
for i in range(1, 60)]) -zeroThru59 = oneOf(["%.2d" % i for i in range(0, 60)]) - -positiveDigit = Word(nums, exact=1, excludeChars="0") +oneThru12 = one_of([f"{i:02}" for i in range(1, 13)]) +oneThru13 = one_of([f"{i:02}" for i in range(1, 14)]) +oneThru23 = one_of([f"{i:02}" for i in range(1, 24)]) +zeroThru23 = one_of([f"{i:02}" for i in range(0, 24)]) +oneThru29 = one_of([f"{i:02}" for i in range(1, 30)]) +oneThru30 = one_of([f"{i:02}" for i in range(1, 31)]) +oneThru31 = one_of([f"{i:02}" for i in range(1, 32)]) +oneThru59 = one_of([f"{i:02}" for i in range(1, 60)]) +zeroThru59 = one_of([f"{i:02}" for i in range(0, 60)]) + digit = Word(nums, exact=1) +positiveDigit = Word(nums, exact=1, exclude_chars="0") +positiveInteger = Combine(positiveDigit + ZeroOrMore(digit)) second = zeroThru59 minute = zeroThru59 @@ -58,18 +70,23 @@ month = oneThru12("month") monthDay = ( - (oneOf("01 03 05 07 08 10 12")("month") + "-" + oneThru31("day")) - ^ (oneOf("04 06 09 11")("month") + "-" + oneThru30("day")) + (one_of("01 03 05 07 08 10 12")("month") + "-" + oneThru31("day")) + ^ (one_of("04 06 09 11")("month") + "-" + oneThru30("day")) ^ (L("02")("month") + "-" + oneThru29("day")) ) +# Significant digits suffix +significantDigits = "S" + Word(nums)("significant_digits") + # 4 digits, 0 to 9 positiveYear = Word(nums, exact=4) # Negative version of positive year, but "-0000" is illegal negativeYear = NotAny(L("-0000")) + ("-" + positiveYear) -year = Combine(positiveYear ^ negativeYear)("year") +year = Combine(positiveYear ^ negativeYear)("year") + Optional(significantDigits) +# simple version for Consecutives +year_basic = Combine(positiveYear ^ negativeYear)("year") yearMonth = year + "-" + month yearMonthDay = year + "-" + monthDay # o hai iso date @@ -77,15 +94,15 @@ date = Combine(year ^ yearMonth ^ yearMonthDay)("date") Date.set_parser(date) -zoneOffsetHour = oneThru13 -zoneOffset = L("Z") ^ ( +zone_offsetHour = oneThru13 +zone_offset = L("Z") ^ ( Regex("[+-]") - + 
(zoneOffsetHour + Optional(":" + minute) ^ L("14:00") ^ ("00:" + oneThru59)) + + (zone_offsetHour + Optional(":" + minute) ^ L("14:00") ^ ("00:" + oneThru59)) ) baseTime = Combine(hour + ":" + minute + ":" + second ^ "24:00:00") -time = Combine(baseTime + Optional(zoneOffset))("time") +time = Combine(baseTime + Optional(zone_offset))("time") dateAndTime = date + "T" + time DateAndTime.set_parser(dateAndTime) @@ -99,10 +116,10 @@ # (* ************************** Level 1 *************************** *) # (* ** Auxiliary Assignments for Level 1 ** *) -UASymbol = Combine(oneOf("? ~ %")) +UASymbol = Combine(one_of("? ~ %")) UA.set_parser(UASymbol) -seasonNumber = oneOf("21 22 23 24") +seasonNumber = one_of("21 22 23 24") # (* *** Season (unqualified) *** *) season = year + "-" + seasonNumber("season") @@ -112,9 +129,13 @@ # (* *** Long Year - Simple Form *** *) -longYearSimple = "Y" + Combine( - Optional("-") + positiveDigit + digit + digit + digit + OneOrMore(digit) -)("year") +longYearSimple = ( + "Y" + + Combine(Optional("-") + positiveDigit + digit + digit + digit + OneOrMore(digit))( + "year" + ) + + Optional(significantDigits) +) LongYear.set_parser(longYearSimple) # (* *** L1Interval *** *) @@ -131,9 +152,9 @@ def f(toks): l1Start = ".." ^ uaDateOrSeason -l1Start.addParseAction(f) +l1Start.add_parse_action(f) l1End = uaDateOrSeason ^ ".." 
-l1End.addParseAction(f) +l1End.add_parse_action(f) level1Interval = Optional(l1Start)("lower") + "/" + l1End("upper") ^ l1Start( "lower" @@ -141,17 +162,19 @@ def f(toks): Level1Interval.set_parser(level1Interval) # (* *** unspecified *** *) -yearWithOneOrTwoUnspecifedDigits = Combine(digit + digit + (digit ^ "X") + "X")("year") +yearWithOneOrTwoOrThreeUnspecifedDigits = Combine( + Optional("-") + digit + (digit ^ "X") + (digit ^ "X") + "X" +)("year") monthUnspecified = year + "-" + L("XX")("month") dayUnspecified = yearMonth + "-" + L("XX")("day") dayAndMonthUnspecified = year + "-" + L("XX")("month") + "-" + L("XX")("day") unspecified = ( - yearWithOneOrTwoUnspecifedDigits + yearWithOneOrTwoOrThreeUnspecifedDigits ^ monthUnspecified ^ dayUnspecified ^ dayAndMonthUnspecified -) +) + Optional(UASymbol)("ua") Unspecified.set_parser(unspecified) # (* *** uncertainOrApproxDate *** *) @@ -173,7 +196,7 @@ def f(toks): dayWithX = Combine(("X" + digitOrX) ^ (digitOrX + "X"))("day") # 2-digit month with at least one 'X' present -monthWithX = Combine(oneOf("0X 1X") ^ ("X" + digitOrX))("month") +monthWithX = Combine(one_of("0X 1X") ^ ("X" + digitOrX))("month") # 4-digit year with at least one 'X' present yearWithX = Combine( @@ -238,13 +261,12 @@ def f(toks): seasonQualified = season + "^" + seasonQualifier # (* ** Long Year - Scientific Form ** *) -positiveInteger = Combine(positiveDigit + ZeroOrMore(digit)) longYearScientific = ( "Y" + Combine(Optional("-") + positiveInteger)("base") + "E" + positiveInteger("exponent") - + Optional("S" + positiveInteger("precision")) + + Optional(significantDigits) ) ExponentialYear.set_parser(longYearScientific) @@ -260,15 +282,13 @@ def f(toks): ) Level2Interval.set_parser(level2Interval) -# (* ** Masked precision ** *) eliminated in latest specs -# maskedPrecision = Combine(digit + digit + ((digit + "x") ^ "xx"))("year") -# MaskedPrecision.set_parser(maskedPrecision) - # (* ** Inclusive list and choice list** *) consecutives = ( 
(yearMonthDay("lower") + ".." + yearMonthDay("upper")) ^ (yearMonth("lower") + ".." + yearMonth("upper")) - ^ (year("lower") + ".." + year("upper")) + ^ ( + year_basic("lower") + ".." + year_basic("upper") + ) # using year_basic because some tests were throwing `'list' object has no attribute 'expandtabs'` - somewhere, pyparsing.parse_string() was being passed a list ) Consecutives.set_parser(consecutives) @@ -280,8 +300,8 @@ def f(toks): ^ consecutives ) -earlier = L("..").addParseAction(f)("lower") + date("upper").addParseAction(f) -later = date("lower").addParseAction(f) + L("..").addParseAction(f)("upper") +earlier = L("..").add_parse_action(f)("lower") + date("upper").add_parse_action(f) +later = date("lower").add_parse_action(f) + L("..").add_parse_action(f)("upper") EarlierConsecutives.set_parser(earlier) LaterConsecutives.set_parser(later) @@ -302,7 +322,9 @@ def f(toks): # (* *** L2 Season *** *) -seasonL2Number = oneOf("21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41") +seasonL2Number = one_of( + "21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41" +) l2season = year + "-" + seasonL2Number("season") Level2Season.set_parser(l2season) @@ -323,14 +345,32 @@ def f(toks): ) -def parse_edtf(str, parseAll=True, fail_silently=False): +@remapparams(parseAll="parse_all") +def parse_edtf( + input_string: str, + parse_all: bool = True, + fail_silently: bool = False, + debug: bool | None = None, +): + if debug is None: + debug = DEBUG_PYPARSING + + if not input_string: + raise EDTFParseException(input_string) + try: - if not str: - raise ParseException("You must supply some input text") - p = edtfParser.parseString(str.strip(), parseAll) + p = edtfParser.parse_string(input_string.strip(), parse_all) if p: return p[0] + return None except ParseException as err: if fail_silently: return None - raise EDTFParseException(err) from err + if debug: + raise + raise EDTFParseException(input_string, err) from None + + +def is_valid_edtf(input_string: 
str) -> bool: + """Returns True if the input string was successfully parsed; False if it isn't.""" + return parse_edtf(input_string, fail_silently=True) is not None diff --git a/edtf/parser/grammar_test.py b/edtf/parser/grammar_test.py deleted file mode 100644 index c8ff727..0000000 --- a/edtf/parser/grammar_test.py +++ /dev/null @@ -1,360 +0,0 @@ -from pyparsing import ( - Combine, - NotAny, - OneOrMore, - Optional, - ParseException, - Regex, - Word, - ZeroOrMore, - nums, - oneOf, -) -from pyparsing import Literal as L - -from edtf.parser.edtf_exceptions import EDTFParseException - -# (* ************************** Level 0 *************************** *) -from edtf.parser.parser_classes import ( - UA, - Consecutives, - Date, - DateAndTime, - EarlierConsecutives, - ExponentialYear, - Interval, - LaterConsecutives, - Level1Interval, - Level2Interval, # , Testi - LongYear, - MultipleDates, - OneOfASet, - PartialUncertainOrApproximate, - PartialUnspecified, - Season, - UncertainOrApproximate, - Unspecified, -) - -oneThru12 = oneOf(["%.2d" % i for i in range(1, 13)]) -oneThru13 = oneOf(["%.2d" % i for i in range(1, 14)]) -oneThru23 = oneOf(["%.2d" % i for i in range(1, 24)]) -zeroThru23 = oneOf(["%.2d" % i for i in range(0, 24)]) -oneThru29 = oneOf(["%.2d" % i for i in range(1, 30)]) -oneThru30 = oneOf(["%.2d" % i for i in range(1, 31)]) -oneThru31 = oneOf(["%.2d" % i for i in range(1, 32)]) -oneThru59 = oneOf(["%.2d" % i for i in range(1, 60)]) -zeroThru59 = oneOf(["%.2d" % i for i in range(0, 60)]) - -positiveDigit = Word(nums, exact=1, excludeChars="0") -digit = Word(nums, exact=1) - -second = zeroThru59 -minute = zeroThru59 -hour = zeroThru23 -day = oneThru31("day") - -month = oneThru12("month") -monthDay = ( - (oneOf("01 03 05 07 08 10 12")("month") + "-" + oneThru31("day")) - ^ (oneOf("04 06 09 11")("month") + "-" + oneThru30("day")) - ^ (L("02")("month") + "-" + oneThru29("day")) -) - -# 4 digits, 0 to 9 -positiveYear = Word(nums, exact=4) - -# Negative version of 
positive year, but "-0000" is illegal -negativeYear = NotAny(L("-0000")) + ("-" + positiveYear) - -year = Combine(positiveYear ^ negativeYear)("year") - -yearMonth = year + "-" + month -yearMonthDay = year + "-" + monthDay # o hai iso date - -date = Combine(year ^ yearMonth ^ yearMonthDay)("date") -Date.set_parser(date) - -zoneOffsetHour = oneThru13 -zoneOffset = L("Z") ^ ( - Regex("[+-]") - + (zoneOffsetHour + Optional(":" + minute) ^ L("14:00") ^ ("00:" + oneThru59)) -) - -baseTime = Combine(hour + ":" + minute + ":" + second ^ "24:00:00") - -time = Combine(baseTime + Optional(zoneOffset))("time") - -dateAndTime = date + "T" + time -DateAndTime.set_parser(dateAndTime) - -l0Interval = date("lower") + "/" + date("upper") -Interval.set_parser(l0Interval) - -level0Expression = date ^ dateAndTime ^ l0Interval - - -# (* ************************** Level 1 *************************** *) - -# (* ** Auxiliary Assignments for Level 1 ** *) -UASymbol = Combine(oneOf("? ~ %")) -UA.set_parser(UASymbol) - -seasonNumber = oneOf("21 22 23 24") - -# (* *** Season (unqualified) *** *) -season = year + "-" + seasonNumber("season") -Season.set_parser(season) - -dateOrSeason = date("") ^ season - -# (* *** Long Year - Simple Form *** *) - -longYearSimple = "Y" + Combine( - Optional("-") + positiveDigit + digit + digit + digit + OneOrMore(digit) -)("year") -LongYear.set_parser(longYearSimple) - -# (* *** L1Interval *** *) -uaDateOrSeason = dateOrSeason + Optional(UASymbol) - - -# unspecifiedIntervalSec = L('..')('unknownOrOpen') + FollowedBy(L("/") + uaDateOrSeason)('other_section_element') -# Testi.set_parser(unspecifiedIntervalSec) - - -# bit of a kludge here to get the all the relevant tokens into the parse action -# cleanly otherwise the parameter names are overlapped. -def f(toks): - try: - return {"date": toks[0], "ua": toks[1]} - except IndexError: - return {"date": toks[0], "ua": None} - - -l1Start = ".." 
^ uaDateOrSeason -# l1Start = unspecifiedIntervalSec ^ uaDateOrSeason -l1Start.addParseAction(f) -l1End = uaDateOrSeason ^ ".." -l1End.addParseAction(f) - -# level1Interval = l1Start("lower") + "/" + l1End("upper") -level1Interval = Optional(l1Start)("lower") + "/" + l1End("upper") ^ l1Start( - "lower" -) + "/" + Optional(l1End("upper")) -Level1Interval.set_parser(level1Interval) - -# (* *** unspecified *** *) -yearWithOneOrTwoUnspecifedDigits = Combine(digit + digit + (digit ^ "X") + "X")("year") -monthUnspecified = year + "-" + L("XX")("month") -dayUnspecified = yearMonth + "-" + L("XX")("day") -dayAndMonthUnspecified = year + "-" + L("XX")("month") + "-" + L("XX")("day") - -unspecified = ( - yearWithOneOrTwoUnspecifedDigits - ^ monthUnspecified - ^ dayUnspecified - ^ dayAndMonthUnspecified -) -Unspecified.set_parser(unspecified) - -# (* *** uncertainOrApproxDate *** *) - -uncertainOrApproxDate = date("date") + UASymbol("ua") -UncertainOrApproximate.set_parser(uncertainOrApproxDate) - -level1Expression = ( - uncertainOrApproxDate ^ unspecified ^ level1Interval ^ longYearSimple ^ season -) - -# (* ************************** Level 2 *************************** *) - -# (* ** Internal Unspecified** *) - -digitOrU = Word(nums + "X", exact=1) - -# 2-digit day with at least one 'X' present -dayWithU = Combine(("X" + digitOrU) ^ (digitOrU + "X"))("day") - -# 2-digit month with at least one 'X' present -monthWithU = Combine(oneOf("0X 1X") ^ ("X" + digitOrU))("month") - -# 4-digit year with at least one 'X' present -yearWithU = Combine( - ("X" + digitOrU + digitOrU + digitOrU) - ^ (digitOrU + "X" + digitOrU + digitOrU) - ^ (digitOrU + digitOrU + "X" + digitOrU) - ^ (digitOrU + digitOrU + digitOrU + "X") -)("year") - -yearMonthWithU = (Combine(year("") ^ yearWithU(""))("year") + "-" + monthWithU) ^ ( - yearWithU + "-" + month -) - -monthDayWithU = (Combine(month("") ^ monthWithU(""))("month") + "-" + dayWithU) ^ ( - monthWithU + "-" + day -) - -yearMonthDayWithU = ( - ( - 
yearWithU - + "-" - + Combine(month("") ^ monthWithU(""))("month") - + "-" - + Combine(day("") ^ dayWithU(""))("day") - ) - ^ (year + "-" + monthWithU + "-" + Combine(day("") ^ dayWithU(""))("day")) - ^ (year + "-" + month + "-" + dayWithU) -) - -partialUnspecified = yearWithU ^ yearMonthWithU ^ yearMonthDayWithU -PartialUnspecified.set_parser(partialUnspecified) - -# (* ** Internal Uncertain or Approximate** *) - -# this line is out of spec, but the given examples (e.g. '(2004)?-06-04~') -# appear to require it. -year_with_brackets = year ^ ("(" + year + ")") - -# second clause below needed Optional() around the "year_ua" UASymbol, for dates -# like '(2011)-06-04~' to work. - -IUABase = ( - ( - year_with_brackets - + UASymbol("year_ua") - + "-" - + month - + Optional("-(" + day + ")" + UASymbol("day_ua")) - ) - ^ ( - year_with_brackets - + Optional(UASymbol)("year_ua") - + "-" - + monthDay - + Optional(UASymbol)("month_day_ua") - ) - ^ ( - year_with_brackets - + Optional(UASymbol)("year_ua") - + "-(" - + month - + ")" - + UASymbol("month_ua") - + Optional("-(" + day + ")" + UASymbol("day_ua")) - ) - ^ ( - year_with_brackets - + Optional(UASymbol)("year_ua") - + "-(" - + month - + ")" - + UASymbol("month_ua") - + Optional("-" + day) - ) - ^ (yearMonth + UASymbol("year_month_ua") + "-(" + day + ")" + UASymbol("day_ua")) - ^ (yearMonth + UASymbol("year_month_ua") + "-" + day) - ^ (yearMonth + "-(" + day + ")" + UASymbol("day_ua")) - ^ (year + "-(" + monthDay + ")" + UASymbol("month_day_ua")) - ^ (season("ssn") + UASymbol("season_ua")) -) - -partialUncertainOrApproximate = IUABase ^ ("(" + IUABase + ")" + UASymbol("all_ua")) -PartialUncertainOrApproximate.set_parser(partialUncertainOrApproximate) - -dateWithInternalUncertainty = partialUncertainOrApproximate ^ partialUnspecified - -qualifyingString = Regex(r"\S") # any nonwhitespace char - -# (* ** SeasonQualified ** *) -seasonQualifier = qualifyingString -seasonQualified = season + "^" + seasonQualifier - -# (* ** 
Long Year - Scientific Form ** *) -positiveInteger = Combine(positiveDigit + ZeroOrMore(digit)) -longYearScientific = ( - "Y" - + Combine(Optional("-") + positiveInteger)("base") - + "E" - + positiveInteger("exponent") - + Optional("S" + positiveInteger("precision")) -) -ExponentialYear.set_parser(longYearScientific) - -# (* ** level2Interval ** *) -level2Interval = ( - (dateOrSeason("lower") + "/" + dateWithInternalUncertainty("upper")) - ^ (dateWithInternalUncertainty("lower") + "/" + dateOrSeason("upper")) - ^ ( - dateWithInternalUncertainty("lower") - + "/" - + dateWithInternalUncertainty("upper") - ) -) -Level2Interval.set_parser(level2Interval) - -# (* ** Masked precision ** *) eliminated in latest specs -# maskedPrecision = Combine(digit + digit + ((digit + "x") ^ "xx"))("year") -# MaskedPrecision.set_parser(maskedPrecision) - -# (* ** Inclusive list and choice list** *) -consecutives = ( - (yearMonthDay("lower") + ".." + yearMonthDay("upper")) - ^ (yearMonth("lower") + ".." + yearMonth("upper")) - ^ (year("lower") + ".." + year("upper")) -) -Consecutives.set_parser(consecutives) - -listElement = ( - date - ^ dateWithInternalUncertainty - ^ uncertainOrApproxDate - ^ unspecified - ^ consecutives -) - -earlier = ".." + date("upper") -EarlierConsecutives.set_parser(earlier) -later = date("lower") + ".." 
-LaterConsecutives.set_parser(later) - -listContent = ( - (earlier + ZeroOrMore("," + listElement)) - ^ (Optional(earlier + ",") + ZeroOrMore(listElement + ",") + later) - ^ (listElement + OneOrMore("," + listElement)) - ^ consecutives -) - -choiceList = "[" + listContent + "]" -OneOfASet.set_parser(choiceList) - -inclusiveList = "{" + listContent + "}" -MultipleDates.set_parser(inclusiveList) - -level2Expression = ( - partialUncertainOrApproximate - ^ partialUnspecified - ^ choiceList - ^ inclusiveList - ^ level2Interval - ^ longYearScientific - ^ seasonQualified -) - -# putting it all together -edtfParser = ( - level0Expression("level0") ^ level1Expression("level1") ^ level2Expression("level2") -) - - -def parse_edtf(str, parseAll=True, fail_silently=False): - try: - if not str: - raise ParseException("You must supply some input text") - p = edtfParser.parseString(str.strip(), parseAll) - if p: - return p[0] - except ParseException as err: - if fail_silently: - return None - raise EDTFParseException(err) from err diff --git a/edtf/parser/parser_classes.py b/edtf/parser/parser_classes.py index 2b4368a..67dd8ee 100644 --- a/edtf/parser/parser_classes.py +++ b/edtf/parser/parser_classes.py @@ -1,9 +1,10 @@ import calendar import math -import re +from collections.abc import Callable from datetime import date, datetime from operator import add, sub from time import struct_time +from typing import Optional from dateutil.relativedelta import relativedelta @@ -27,7 +28,7 @@ PRECISION_DAY = "day" -def days_in_month(year, month): +def days_in_month(year: int, month: int) -> int: """ Return the number of days in the given year and month, where month is 1=January to 12=December, and respecting leap years as identified by @@ -49,7 +50,7 @@ def days_in_month(year, month): }[month] -def apply_delta(op, time_struct, delta): +def apply_delta(op: Callable, time_struct: struct_time, delta) -> struct_time: """ Apply a `relativedelta` to a `struct_time` data structure. 
@@ -75,9 +76,9 @@ def apply_delta(op, time_struct, delta): # Adjust the year to be close to the 2000 millenium in 1,000 year # increments to try and retain accurate relative leap years - actual_year = time_struct.tm_year - millenium = int(float(actual_year) / 1000) - millenium_diff = (2 - millenium) * 1000 + actual_year: int = time_struct.tm_year + millenium: int = int(float(actual_year) / 1000) + millenium_diff: int = (2 - millenium) * 1000 adjusted_year = actual_year + millenium_diff # Apply delta to the date/time with adjusted year dt = datetime(*(adjusted_year,) + time_struct[1:6]) @@ -91,16 +92,19 @@ def apply_delta(op, time_struct, delta): class EDTFObject: """ - Object to attact to a parser to become instantiated when the parser + Object to attach to a parser to become instantiated when the parser completes. """ parser = None + _is_approximate: bool + _is_uncertain: bool + _uncertain_and_approximate: bool @classmethod def set_parser(cls, p): cls.parser = p - p.addParseAction(cls.parse_action) + p.add_parse_action(cls.parse_action) @classmethod def parse_action(cls, toks): @@ -113,68 +117,69 @@ def parse_action(cls, toks): @classmethod def parse(cls, s): - return cls.parser.parseString(s)[0] + return cls.parser.parse_string(s)[0] - def __repr__(self): + def __repr__(self) -> str: return f"{type(self).__name__}: '{str(self)}'" - def __init__(self, *args, **kwargs): - str = f"{type(self).__name__}.__init__(*{args}, **{kwargs})" - raise NotImplementedError(f"{str} is not implemented.") + def __init__(self, *args, **kwargs) -> None: + message: str = f"{type(self).__name__}.__init__(*{args}, **{kwargs})" + raise NotImplementedError(f"{message} is not implemented.") - def __str__(self): + def __str__(self) -> str: raise NotImplementedError - def _strict_date(self, lean): + def _strict_date(self, lean: str = EARLIEST): raise NotImplementedError - def lower_strict(self): + def lower_strict(self) -> struct_time: return self._strict_date(lean=EARLIEST) - def 
upper_strict(self): + def upper_strict(self) -> struct_time: return self._strict_date(lean=LATEST) - def _get_fuzzy_padding(self, lean): + def _get_fuzzy_padding(self, lean: str) -> relativedelta: """ Subclasses should override this to pad based on how precise they are. """ - return relativedelta(0) + return relativedelta(None) - def get_is_approximate(self): + def get_is_approximate(self) -> bool: return getattr(self, "_is_approximate", False) - def set_is_approximate(self, val): + def set_is_approximate(self, val: bool) -> None: self._is_approximate = val - is_approximate = property(get_is_approximate, set_is_approximate) + is_approximate = property(get_is_approximate, set_is_approximate) # noqa - def get_is_uncertain(self): + def get_is_uncertain(self) -> bool: return getattr(self, "_is_uncertain", False) - def set_is_uncertain(self, val): + def set_is_uncertain(self, val: bool) -> None: self._is_uncertain = val - is_uncertain = property(get_is_uncertain, set_is_uncertain) + is_uncertain = property(get_is_uncertain, set_is_uncertain) # noqa - def get_is_uncertain_and_approximate(self): + def get_is_uncertain_and_approximate(self) -> bool: return getattr(self, "_uncertain_and_approximate", False) - def set_is_uncertain_and_approximate(self, val): + def set_is_uncertain_and_approximate(self, val: bool) -> None: self._uncertain_and_approximate = val is_uncertain_and_approximate = property( - get_is_uncertain_and_approximate, set_is_uncertain_and_approximate + get_is_uncertain_and_approximate, # noqa + set_is_uncertain_and_approximate, # noqa ) - def lower_fuzzy(self): + def lower_fuzzy(self) -> struct_time: strict_val = self.lower_strict() return apply_delta(sub, strict_val, self._get_fuzzy_padding(EARLIEST)) - def upper_fuzzy(self): + def upper_fuzzy(self) -> struct_time: strict_val = self.upper_strict() return apply_delta(add, strict_val, self._get_fuzzy_padding(LATEST)) - def __eq__(self, other): + def __eq__(self, other) -> bool: if isinstance(other, 
EDTFObject): return str(self) == str(other) elif isinstance(other, date): @@ -183,7 +188,7 @@ def __eq__(self, other): return self._strict_date() == trim_struct_time(other) return False - def __ne__(self, other): + def __ne__(self, other) -> bool: if isinstance(other, EDTFObject): return str(self) != str(other) elif isinstance(other, date): @@ -192,7 +197,7 @@ def __ne__(self, other): return self._strict_date() != trim_struct_time(other) return True - def __gt__(self, other): + def __gt__(self, other) -> bool: if isinstance(other, EDTFObject): return self.lower_strict() > other.lower_strict() elif isinstance(other, date): @@ -203,7 +208,7 @@ def __gt__(self, other): f"can't compare {type(self).__name__} with {type(other).__name__}" ) - def __ge__(self, other): + def __ge__(self, other) -> bool: if isinstance(other, EDTFObject): return self.lower_strict() >= other.lower_strict() elif isinstance(other, date): @@ -214,7 +219,7 @@ def __ge__(self, other): f"can't compare {type(self).__name__} with {type(other).__name__}" ) - def __lt__(self, other): + def __lt__(self, other) -> bool: if isinstance(other, EDTFObject): return self.lower_strict() < other.lower_strict() elif isinstance(other, date): @@ -225,7 +230,7 @@ def __lt__(self, other): f"can't compare {type(self).__name__} with {type(other).__name__}" ) - def __le__(self, other): + def __le__(self, other) -> bool: if isinstance(other, EDTFObject): return self.lower_strict() <= other.lower_strict() elif isinstance(other, date): @@ -241,81 +246,132 @@ def __le__(self, other): class Date(EDTFObject): - def set_year(self, y): + def __init__( # noqa + self, + year: str | None = None, + month: str | None = None, + day: str | None = None, + significant_digits=None, + **kwargs, + ): + for param in ("date", "lower", "upper"): + if param in kwargs: + self.__init__(**kwargs[param]) + return + + self._year: str | None = ( + year # Year is required, but sometimes passed in as a 'date' dict. 
+ ) + self._month: str | None = month + self._day: str | None = day + self.significant_digits: int | None = ( + int(significant_digits) if significant_digits else None + ) + + def set_year(self, y: str | None): if y is None: raise AttributeError("Year must not be None") self._year = y - def get_year(self): + def get_year(self) -> str | None: return self._year - year = property(get_year, set_year) + year = property(get_year, set_year) # noqa - def set_month(self, m): + def set_month(self, m: str | None): self._month = m if m is None: - self.day = None + self._day = None - def get_month(self): + def get_month(self) -> str | None: return self._month - month = property(get_month, set_month) + month = property(get_month, set_month) # noqa - def __init__(self, year=None, month=None, day=None, **kwargs): - for param in ("date", "lower", "upper"): - if param in kwargs: - self.__init__(**kwargs[param]) - return + def set_day(self, d: str | None): + self._day = d + if d is None: + self._day = None - self.year = year # Year is required, but sometimes passed in as a 'date' dict. 
- self.month = month - self.day = day + def get_day(self) -> str | None: + return self._day - def __str__(self): - r = self.year - if self.month: - r += f"-{self.month}" - if self.day: - r += f"-{self.day}" + day = property(get_day, set_day) # noqa + + def __str__(self) -> str: + r = f"{self._year}" + if self._month is not None: + r += f"-{self._month}" + if self._day is not None: + r += f"-{self._day}" + if self.significant_digits: + r += f"S{self.significant_digits}" return r - def isoformat(self, default=date.max): - return "%s-%02d-%02d" % ( - self.year, - int(self.month or default.month), - int(self.day or default.day), + def isoformat(self, default=date.max) -> str: + return f"{self._year}-{int(self._month or default.month):02d}-{int(self._day or default.day):02d}" + + def lower_fuzzy(self) -> struct_time: + if not hasattr(self, "significant_digits") or not self.significant_digits: + return apply_delta( + sub, self.lower_strict(), self._get_fuzzy_padding(EARLIEST) + ) + + total_digits: int = len(self._year) if self._year else 0 + i_year: int = int(self._year) if self._year else 0 + insignificant_digits: int = total_digits - self.significant_digits + lower_year: int = ( + i_year // (10**insignificant_digits) * (10**insignificant_digits) ) + return struct_time([lower_year, 1, 1] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS) + + def upper_fuzzy(self) -> struct_time: + if not hasattr(self, "significant_digits") or not self.significant_digits: + return apply_delta( + add, self.upper_strict(), self._get_fuzzy_padding(LATEST) + ) - def _precise_year(self, lean): + total_digits: int = len(self._year) if self._year else 0 + i_year: int = int(self._year) if self._year else 0 + insignificant_digits: int = total_digits - self.significant_digits + upper_year: int = (i_year // (10**insignificant_digits) + 1) * ( + 10**insignificant_digits + ) - 1 + return struct_time([upper_year, 12, 31] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS) + + def _precise_year(self, lean: str) -> int: # 
Replace any ambiguous characters in the year string with 0s or 9s + if not self._year: + return 0 + if lean == EARLIEST: - return int(re.sub(r"X", r"0", self.year)) + rep = self._year.replace("X", "0") else: - return int(re.sub(r"X", r"9", self.year)) + rep = self._year.replace("X", "9") - def _precise_month(self, lean): - if self.month and self.month != "XX": + return int(rep) + + def _precise_month(self, lean: str) -> int: + if self._month and self._month != "XX": try: - return int(self.month) + return int(self._month) except ValueError as err: raise ValueError( - f"Couldn't convert {self.month} to int (in {self})" + f"Couldn't convert {self._month} to int (in {self})" ) from err - else: - return 1 if lean == EARLIEST else 12 + return 1 if lean == EARLIEST else 12 - def _precise_day(self, lean): - if not self.day or self.day == "XX": + def _precise_day(self, lean: str) -> int: + if not self._day or self._day == "XX": if lean == EARLIEST: return 1 else: return days_in_month( self._precise_year(LATEST), self._precise_month(LATEST) ) - else: - return int(self.day) + return int(self._day) - def _strict_date(self, lean): + def _strict_date(self, lean: str = EARLIEST) -> struct_time: """ Return a `time.struct_time` representation of the date. 
""" @@ -330,36 +386,39 @@ def _strict_date(self, lean): ) @property - def precision(self): - if self.day: + def precision(self) -> str: + if self._day: return PRECISION_DAY - if self.month: + if self._month: return PRECISION_MONTH return PRECISION_YEAR + def estimated(self) -> int: + return self._precise_year(EARLIEST) + class DateAndTime(EDTFObject): - def __init__(self, date, time): - self.date = date + def __init__(self, date: Date, time): # noqa: super raises not implemented + self.date: Date = date self.time = time - def __str__(self): + def __str__(self) -> str: return self.isoformat() - def isoformat(self): + def isoformat(self) -> str: return self.date.isoformat() + "T" + self.time - def _strict_date(self, lean): + def _strict_date(self, lean: str = EARLIEST) -> struct_time: return self.date._strict_date(lean) - def __eq__(self, other): + def __eq__(self, other) -> bool: if isinstance(other, datetime): return self.isoformat() == other.isoformat() elif isinstance(other, struct_time): return self._strict_date() == trim_struct_time(other) return super().__eq__(other) - def __ne__(self, other): + def __ne__(self, other) -> bool: if isinstance(other, datetime): return self.isoformat() != other.isoformat() elif isinstance(other, struct_time): @@ -368,22 +427,20 @@ def __ne__(self, other): class Interval(EDTFObject): - def __init__(self, lower, upper): + def __init__(self, lower, upper): # noqa: super() raises not implemented self.lower = lower self.upper = upper def __str__(self): return f"{self.lower}/{self.upper}" - def _strict_date(self, lean): + def _strict_date(self, lean: str = EARLIEST) -> struct_time: if lean == EARLIEST: - r = self.lower._strict_date(lean) - else: - r = self.upper._strict_date(lean) - return r + return self.lower._strict_date(lean) + return self.upper._strict_date(lean) @property - def precision(self): + def precision(self) -> int | None: if self.lower.precision == self.upper.precision: return self.lower.precision return None @@ -398,63 
+455,79 @@ def parse_action(cls, toks): args = toks.asList() return cls(*args) - def __init__(self, *args): + def __init__(self, *args) -> None: # noqa: super() raises not implemented if len(args) != 1: raise AssertionError("UA must have exactly one argument") ua = args[0] - self.is_uncertain = "?" in ua - self.is_approximate = "~" in ua - self.is_uncertain_and_approximate = "%" in ua + self.is_uncertain: bool = "?" in ua + self.is_approximate: bool = "~" in ua + self.is_uncertain_and_approximate: bool = "%" in ua - def __str__(self): - d = "" + def __str__(self) -> str: if self.is_uncertain: - d += "?" - if self.is_approximate: - d += "~" - if self.is_uncertain_and_approximate: - d += "%" - return d + return "?" + elif self.is_approximate: + return "~" + elif self.is_uncertain_and_approximate: + return "%" + return "" - def _get_multiplier(self): + def _get_multiplier(self) -> float | None: if self.is_uncertain_and_approximate: return appsettings.MULTIPLIER_IF_BOTH elif self.is_uncertain: return appsettings.MULTIPLIER_IF_UNCERTAIN elif self.is_approximate: return appsettings.MULTIPLIER_IF_APPROXIMATE + return None class UncertainOrApproximate(EDTFObject): - def __init__(self, date, ua): + def __init__(self, date, ua): # noqa: super() raises not implemented self.date = date self.ua = ua + self.is_uncertain = ua.is_uncertain if ua else False + self.is_approximate = ua.is_approximate if ua else False + self.is_uncertain_and_approximate = ( + ua.is_uncertain_and_approximate if ua else False + ) - def __str__(self): + def __str__(self) -> str: if self.ua: return f"{self.date}{self.ua}" - else: - return str(self.date) + return str(self.date) - def _strict_date(self, lean): + def _strict_date(self, lean: str = EARLIEST) -> tuple: return self.date._strict_date(lean) def _get_fuzzy_padding(self, lean): if not self.ua: - return relativedelta(0) + return relativedelta() multiplier = self.ua._get_multiplier() + padding = relativedelta() + + # Check the presence of uncertainty 
on each component + # self.precision not helpful here: + # L1 qualified EDTF dates apply qualification across all parts of the date + if self.date.year: + padding += relativedelta( + years=int(multiplier * appsettings.PADDING_YEAR_PRECISION.years) + ) + if self.date.month: + padding += relativedelta( + months=int(multiplier * appsettings.PADDING_MONTH_PRECISION.months) + ) + if self.date.day: + padding += relativedelta( + days=int(multiplier * appsettings.PADDING_DAY_PRECISION.days) + ) - if self.date.precision == PRECISION_DAY: - return multiplier * appsettings.PADDING_DAY_PRECISION - elif self.date.precision == PRECISION_MONTH: - return multiplier * appsettings.PADDING_MONTH_PRECISION - elif self.date.precision == PRECISION_YEAR: - return multiplier * appsettings.PADDING_YEAR_PRECISION + return padding class UnspecifiedIntervalSection(EDTFObject): - def __init__(self, sectionOpen=False, other_section_element=None): + def __init__(self, sectionOpen=False, other_section_element=None): # noqa: super() raises not implemented if sectionOpen: self.is_open = True self.is_unknown = False @@ -466,22 +539,25 @@ def __init__(self, sectionOpen=False, other_section_element=None): def __str__(self): if self.is_unknown: return "" - else: - return ".." + return ".." 
+ + def _strict_date(self, lean: str = EARLIEST) -> float | None: + if lean not in (EARLIEST, LATEST): + raise ValueError("lean must be one of EARLIEST or LATEST") - def _strict_date(self, lean): if lean == EARLIEST: if self.is_unknown: upper = self.other._strict_date(LATEST) return apply_delta(sub, upper, appsettings.DELTA_IF_UNKNOWN) else: return -math.inf - else: + elif lean == LATEST: if self.is_unknown: lower = self.other._strict_date(EARLIEST) return apply_delta(add, lower, appsettings.DELTA_IF_UNKNOWN) else: return math.inf + return None @property def precision(self): @@ -489,11 +565,156 @@ def precision(self): class Unspecified(Date): - pass + def __init__( + self, + year=None, + month=None, + day=None, + significant_digits=None, + ua=None, + **kwargs, + ): + super().__init__( + year=year, + month=month, + day=day, + significant_digits=significant_digits, + **kwargs, + ) + self.ua = ua + self.is_uncertain = ua.is_uncertain if ua else False + self.is_approximate = ua.is_approximate if ua else False + self.is_uncertain_and_approximate = ( + ua.is_uncertain_and_approximate if ua else False + ) + self.negative = self.year.startswith("-") + + def __str__(self): + base = super().__str__() + if self.ua: + base += str(self.ua) + return base + + def _get_fuzzy_padding(self, lean): + if not self.ua: + return relativedelta() + multiplier = self.ua._get_multiplier() + padding = relativedelta() + + if self.year: + years_padding = self._years_padding(multiplier) + padding += years_padding + if self.month: + padding += relativedelta( + months=int(multiplier * appsettings.PADDING_MONTH_PRECISION.months) + ) + if self.day: + padding += relativedelta( + days=int(multiplier * appsettings.PADDING_DAY_PRECISION.days) + ) + return padding + + def _years_padding(self, multiplier): + """Calculate year padding based on the precision.""" + precision_settings = { + PRECISION_MILLENIUM: appsettings.PADDING_MILLENNIUM_PRECISION.years, + PRECISION_CENTURY: 
appsettings.PADDING_CENTURY_PRECISION.years, + PRECISION_DECADE: appsettings.PADDING_DECADE_PRECISION.years, + PRECISION_YEAR: appsettings.PADDING_YEAR_PRECISION.years, + } + years = precision_settings.get(self.precision, 0) + return relativedelta(years=int(multiplier * years)) + + def lower_fuzzy(self): + strict_val = ( + self.lower_strict() + ) # negative handled in the lower_strict() override + adjusted = apply_delta(sub, strict_val, self._get_fuzzy_padding(EARLIEST)) + return adjusted + + def upper_fuzzy(self): + strict_val = ( + self.upper_strict() + ) # negative handled in the upper_strict() override + + adjusted = apply_delta(add, strict_val, self._get_fuzzy_padding(LATEST)) + return adjusted + + def lower_strict(self): + if self.negative: + strict_val = self._strict_date( + lean=LATEST + ) # gets the year right, but need to adjust day and month + if self.precision in ( + PRECISION_YEAR, + PRECISION_DECADE, + PRECISION_CENTURY, + PRECISION_MILLENIUM, + ): + return struct_time( + (strict_val.tm_year, 1, 1) + + tuple(TIME_EMPTY_TIME) + + tuple(TIME_EMPTY_EXTRAS) + ) + elif self.precision == PRECISION_MONTH: + return struct_time( + (strict_val.tm_year, strict_val.tm_mon, 1) + + tuple(TIME_EMPTY_TIME) + + tuple(TIME_EMPTY_EXTRAS) + ) + else: + return strict_val + + return self._strict_date(lean=EARLIEST) + + def upper_strict(self) -> struct_time: + if self.negative: + strict_val = self._strict_date(lean=EARLIEST) + if self.precision in ( + PRECISION_YEAR, + PRECISION_DECADE, + PRECISION_CENTURY, + PRECISION_MILLENIUM, + ): + return struct_time( + (strict_val.tm_year, 12, 31) + + tuple(TIME_EMPTY_TIME) + + tuple(TIME_EMPTY_EXTRAS) + ) + elif self.precision == PRECISION_MONTH: + days_in_month = calendar.monthrange( + strict_val.tm_year, strict_val.tm_mon + )[1] + return struct_time( + (strict_val.tm_year, strict_val.tm_mon, days_in_month) + + tuple(TIME_EMPTY_TIME) + + tuple(TIME_EMPTY_EXTRAS) + ) + else: + return strict_val + return self._strict_date(lean=LATEST) 
+ + @property + def precision(self): + if self.day: + return PRECISION_DAY + if self.month: + return PRECISION_MONTH + if self.year: + year_no_symbol = self.year.lstrip("-") + if year_no_symbol.isdigit(): + return PRECISION_YEAR + if len(year_no_symbol) == 4 and year_no_symbol.endswith("XXX"): + return PRECISION_MILLENIUM + if len(year_no_symbol) == 4 and year_no_symbol.endswith("XX"): + return PRECISION_CENTURY + if len(year_no_symbol) == 4 and year_no_symbol.endswith("X"): + return PRECISION_DECADE + raise ValueError(f"Unspecified date {self} has no precision") class Level1Interval(Interval): - def __init__(self, lower=None, upper=None): + def __init__(self, lower: Optional[dict] = None, upper: Optional[dict] = None): # noqa if lower: if lower["date"] == "..": self.lower = UnspecifiedIntervalSection( @@ -516,74 +737,122 @@ def __init__(self, lower=None, upper=None): self.upper = UnspecifiedIntervalSection( False, UncertainOrApproximate(**lower) ) + self.is_approximate: bool = ( + self.lower.is_approximate or self.upper.is_approximate + ) + self.is_uncertain: bool = self.lower.is_uncertain or self.upper.is_uncertain + self.is_uncertain_and_approximate = ( + self.lower.is_uncertain_and_approximate + or self.upper.is_uncertain_and_approximate + ) - def _get_fuzzy_padding(self, lean): + def _get_fuzzy_padding(self, lean) -> relativedelta | None: if lean == EARLIEST: return self.lower._get_fuzzy_padding(lean) elif lean == LATEST: return self.upper._get_fuzzy_padding(lean) + return None class LongYear(EDTFObject): - def __init__(self, year): - self.year = year + def __init__(self, year: str, significant_digits: str | None = None): # noqa + self.year: str = year + self.significant_digits: int | None = ( + int(significant_digits) if significant_digits else None + ) - def __str__(self): + def __str__(self) -> str: + if self.significant_digits: + return f"Y{self.year}S{self.significant_digits}" return f"Y{self.year}" - def _precise_year(self): + def _precise_year(self) -> 
int: return int(self.year) - def _strict_date(self, lean): + def _strict_date(self, lean: str = EARLIEST) -> struct_time: py = self._precise_year() if lean == EARLIEST: return struct_time([py, 1, 1] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS) + return struct_time([py, 12, 31] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS) + + def estimated(self) -> int: + return self._precise_year() + + def lower_fuzzy(self) -> struct_time: + full_year = self._precise_year() + strict_val = self.lower_strict() + if not self.significant_digits: + return apply_delta(sub, strict_val, self._get_fuzzy_padding(EARLIEST)) + + insignificant_digits = len(str(full_year)) - int(self.significant_digits) + if insignificant_digits <= 0: + return apply_delta(sub, strict_val, self._get_fuzzy_padding(EARLIEST)) + + padding_value = 10**insignificant_digits + sig_digits = full_year // padding_value + lower_year = sig_digits * padding_value + return apply_delta( + sub, + struct_time([lower_year, 1, 1] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS), + self._get_fuzzy_padding(EARLIEST), + ) + + def upper_fuzzy(self) -> struct_time: + full_year = self._precise_year() + strict_val = self.upper_strict() + if not self.significant_digits: + return apply_delta(add, strict_val, self._get_fuzzy_padding(LATEST)) else: - return struct_time([py, 12, 31] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS) + insignificant_digits = len(str(full_year)) - self.significant_digits + if insignificant_digits <= 0: + return apply_delta(add, strict_val, self._get_fuzzy_padding(LATEST)) + padding_value = 10**insignificant_digits + sig_digits = full_year // padding_value + upper_year = (sig_digits + 1) * padding_value - 1 + return apply_delta( + add, + struct_time([upper_year, 12, 31] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS), + self._get_fuzzy_padding(LATEST), + ) class Season(Date): - def __init__(self, year, season, **kwargs): + def __init__(self, year, season, **kwargs): # noqa self.year = year self.season = season # use season to look up month # day isn't 
part of the 'season' spec, but it helps the inherited # `Date` methods do their thing. self.day = None - def __str__(self): + def __str__(self) -> str: return f"{self.year}-{self.season}" - def _precise_month(self, lean): + def _precise_month(self, lean: str) -> int: rng = appsettings.SEASON_L2_MONTHS_RANGE[int(self.season)] if lean == EARLIEST: return rng[0] - else: - return rng[1] + + return rng[1] # (* ************************** Level 2 *************************** *) class PartialUncertainOrApproximate(Date): - def set_year(self, y): # Year can be None. - self._year = y - - year = property(Date.get_year, set_year) - - def __init__( + def __init__( # noqa self, year=None, month=None, day=None, - year_ua=False, - month_ua=False, - day_ua=False, - year_month_ua=False, - month_day_ua=False, + year_ua: UA | None = None, + month_ua: UA | None = None, + day_ua: UA | None = None, + year_month_ua: UA | None = None, + month_day_ua: UA | None = None, ssn=None, - season_ua=False, - all_ua=False, - year_ua_b=False, + season_ua: UA | None = None, + all_ua: UA | None = None, + year_ua_b: UA | None = None, ): self.year = year self.month = month @@ -602,7 +871,29 @@ def __init__( self.all_ua = all_ua - def __str__(self): + uas = [ + year_ua, + year_ua_b, + month_ua, + day_ua, + year_month_ua, + month_day_ua, + season_ua, + all_ua, + ] + self.is_uncertain: bool = any( + item.is_uncertain for item in uas if hasattr(item, "is_uncertain") + ) + self.is_approximate: bool = any( + item.is_approximate for item in uas if hasattr(item, "is_approximate") + ) + self.is_uncertain_and_approximate: bool = any( + item.is_uncertain_and_approximate + for item in uas + if hasattr(item, "is_uncertain_and_approximate") + ) + + def __str__(self) -> str: if self.season_ua: return f"{self.season}{self.season_ua}" @@ -611,7 +902,10 @@ def __str__(self): else: y = f"{self.year_ua_b}{self.year}" if self.year_ua_b else str(self.year) - m = f"{self.month_ua}{self.month}" if self.month_ua else 
str(self.month) + if self.month: + m = f"{self.month_ua}{self.month}" if self.month_ua else str(self.month) + else: + m = None if self.day: d = f"{self.day_ua}{self.day}" if self.day_ua else str(self.day) @@ -627,35 +921,45 @@ def __str__(self): else: result = f"{y}-({m}-{d}){self.month_day_ua}" else: - result = f"{y}-{m}-{d}" if d else f"{y}-{m}" + if d: + result = f"{y}-{m}-{d}" + elif m: + result = f"{y}-{m}" + else: + result = y if self.all_ua: result = f"({result}){self.all_ua}" return result - def _precise_year(self, lean): + def set_year(self, y): # Year can be None. + self._year = y + + year = property(Date.get_year, set_year) # noqa + + def _precise_year(self, lean: str) -> int: if self.season: return self.season._precise_year(lean) return super()._precise_year(lean) - def _precise_month(self, lean): + def _precise_month(self, lean: str) -> int: if self.season: return self.season._precise_month(lean) return super()._precise_month(lean) - def _precise_day(self, lean): + def _precise_day(self, lean: str) -> int: if self.season: return self.season._precise_day(lean) return super()._precise_day(lean) - def _get_fuzzy_padding(self, lean): + def _get_fuzzy_padding(self, lean: str) -> struct_time: """ This is not a perfect interpretation as fuzziness is introduced for redundant uncertainly modifiers e.g. (2006~)~ will get two sets of fuzziness. 
""" - result = relativedelta(0) + result = relativedelta(None) if self.year_ua: result += ( @@ -717,7 +1021,7 @@ class PartialUnspecified(Unspecified): class Consecutives(Interval): # Treating Consecutive ranges as intervals where one bound is optional - def __init__(self, lower=None, upper=None): + def __init__(self, lower=None, upper=None): # noqa if lower and not isinstance(lower, EDTFObject): self.lower = Date.parse(lower) else: @@ -728,33 +1032,34 @@ def __init__(self, lower=None, upper=None): else: self.upper = upper - def __str__(self): - return "{}..{}".format(self.lower or "", self.upper or "") + def __str__(self) -> str: + return f"{self.lower or ''}..{self.upper or ''}" class EarlierConsecutives(Level1Interval): - def __str__(self): + def __str__(self) -> str: return f"{self.lower}{self.upper}" class LaterConsecutives(Level1Interval): - def __str__(self): + def __str__(self) -> str: return f"{self.lower}{self.upper}" class OneOfASet(EDTFObject): + def __init__(self, *args): # noqa + self.objects = args + @classmethod def parse_action(cls, toks): args = [t for t in toks.asList() if isinstance(t, EDTFObject)] return cls(*args) - def __init__(self, *args): - self.objects = args - - def __str__(self): - return "[{}]".format(", ".join([str(o) for o in self.objects])) + def __str__(self) -> str: + out: str = ", ".join([str(o) for o in self.objects]) + return f"[{out}]" - def _strict_date(self, lean): + def _strict_date(self, lean: str = EARLIEST) -> float: strict_dates = [x._strict_date(lean) for x in self.objects] # Accounting for possible 'inf' and '-inf' values if lean == LATEST: @@ -776,61 +1081,69 @@ def _strict_date(self, lean): class MultipleDates(EDTFObject): + def __init__(self, *args): # noqa + self.objects = args + @classmethod def parse_action(cls, toks): args = [t for t in toks.asList() if isinstance(t, EDTFObject)] return cls(*args) - def __init__(self, *args): - self.objects = args - - def __str__(self): - return "{{{}}}".format(", 
".join([str(o) for o in self.objects])) + def __str__(self) -> str: + out: str = ", ".join([str(o) for o in self.objects]) + return f"{{{out}}}" - def _strict_date(self, lean): + def _strict_date(self, lean: str = EARLIEST) -> float: if lean == LATEST: return max([x._strict_date(lean) for x in self.objects]) - else: - return min([x._strict_date(lean) for x in self.objects]) - - -class MaskedPrecision(Date): - pass + return min([x._strict_date(lean) for x in self.objects]) class Level2Interval(Level1Interval): - def __init__(self, lower, upper): + def __init__(self, lower, upper): # noqa # Check whether incoming lower/upper values are single-item lists, and # if so take just the first item. This works around what I *think* is a - # bug in the grammer that provides us with single-item lists of + # bug in the grammar that provides us with single-item lists of # `PartialUncertainOrApproximate` items for lower/upper values. - if isinstance(lower, (tuple, list)) and len(lower) == 1: + if isinstance(lower, tuple | list) and len(lower) == 1: self.lower = lower[0] else: self.lower = lower - if isinstance(lower, (tuple, list)) and len(upper) == 1: + + if isinstance(lower, tuple | list) and len(upper) == 1: self.upper = upper[0] else: self.upper = upper + self.is_approximate = self.lower.is_approximate or self.upper.is_approximate + self.is_uncertain = self.lower.is_uncertain or self.upper.is_uncertain + self.is_uncertain_and_approximate = ( + self.lower.is_uncertain_and_approximate + or self.upper.is_uncertain_and_approximate + ) + class Level2Season(Season): pass class ExponentialYear(LongYear): - def __init__(self, base, exponent, precision=None): + def __init__(self, base, exponent, significant_digits=None): # noqa self.base = base self.exponent = exponent - self.precision = precision + self.significant_digits = ( + int(significant_digits) if significant_digits else None + ) - def _precise_year(self): + def _precise_year(self) -> int: return int(self.base) * 10 ** 
int(self.exponent) - def get_year(self): - if self.precision: - return f"{self.base}E{self.exponent}S{self.precision}" - else: - return f"{self.base}E{self.exponent}" + def get_year(self) -> str: + if self.significant_digits: + return f"{self.base}E{self.exponent}S{self.significant_digits}" + return f"{self.base}E{self.exponent}" + + year = property(get_year) # noqa - year = property(get_year) + def estimated(self) -> int: + return self._precise_year() diff --git a/edtf/parser/parser_classes_tests.py b/edtf/parser/parser_classes_tests.py deleted file mode 100644 index 857d0f6..0000000 --- a/edtf/parser/parser_classes_tests.py +++ /dev/null @@ -1,834 +0,0 @@ -# ruff: noqa: S101 # Asserts are ok in tests - -import calendar -import re -from datetime import date, datetime -from operator import add, sub -from time import struct_time - -from dateutil.relativedelta import relativedelta - -from edtf import appsettings -from edtf.convert import ( - TIME_EMPTY_EXTRAS, - TIME_EMPTY_TIME, - dt_to_struct_time, - trim_struct_time, -) - -EARLIEST = "earliest" -LATEST = "latest" - -PRECISION_MILLENIUM = "millenium" -PRECISION_CENTURY = "century" -PRECISION_DECADE = "decade" -PRECISION_YEAR = "year" -PRECISION_MONTH = "month" -PRECISION_SEASON = "season" -PRECISION_DAY = "day" - - -def days_in_month(year, month): - """ - Return the number of days in the given year and month, where month is - 1=January to 12=December, and respecting leap years as identified by - `calendar.isleap()` - """ - return { - 1: 31, - 2: 29 if calendar.isleap(year) else 28, - 3: 31, - 4: 30, - 5: 31, - 6: 30, - 7: 31, - 8: 31, - 9: 30, - 10: 31, - 11: 30, - 12: 31, - }[month] - - -def apply_delta(op, time_struct, delta): - """ - Apply a `relativedelta` to a `struct_time` data structure. - - `op` is an operator function, probably always `add` or `sub`tract to - correspond to `a_date + a_delta` and `a_date - a_delta`. 
- - This function is required because we cannot use standard `datetime` module - objects for conversion when the date/time is, or will become, outside the - boundary years 1 AD to 9999 AD. - """ - if not delta: - return time_struct # No work to do - - try: - dt_result = op(datetime(*time_struct[:6]), delta) - return dt_to_struct_time(dt_result) - except (OverflowError, ValueError): - # Year is not within supported 1 to 9999 AD range - pass - - # Here we fake the year to one in the acceptable range to avoid having to - # write our own date rolling logic - - # Adjust the year to be close to the 2000 millenium in 1,000 year - # increments to try and retain accurate relative leap years - actual_year = time_struct.tm_year - millenium = int(float(actual_year) / 1000) - millenium_diff = (2 - millenium) * 1000 - adjusted_year = actual_year + millenium_diff - # Apply delta to the date/time with adjusted year - dt = datetime(*(adjusted_year,) + time_struct[1:6]) - dt_result = op(dt, delta) - # Convert result year back to its original millenium - final_year = dt_result.year - millenium_diff - return struct_time( - (final_year,) + dt_result.timetuple()[1:6] + tuple(TIME_EMPTY_EXTRAS) - ) - - -class EDTFObject: - """ - Object to attact to a parser to become instantiated when the parser - completes. 
- """ - - parser = None - - @classmethod - def set_parser(cls, p): - cls.parser = p - p.addParseAction(cls.parse_action) - - @classmethod - def parse_action(cls, toks): - kwargs = toks.asDict() - try: - return cls(**kwargs) # replace the token list with the class - except Exception as e: - print(f"trying to {cls.__name__}.__init__(**{kwargs})") - raise e - - @classmethod - def parse(cls, s): - return cls.parser.parseString(s)[0] - - def __repr__(self): - return f"{type(self).__name__}: '{str(self)}'" - - def __init__(self, *args, **kwargs): - str = f"{type(self).__name__}.__init__(*{args}, **{kwargs})" - raise NotImplementedError(f"{str} is not implemented.") - - def __str__(self): - raise NotImplementedError - - def _strict_date(self, lean): - raise NotImplementedError - - def lower_strict(self): - return self._strict_date(lean=EARLIEST) - - def upper_strict(self): - return self._strict_date(lean=LATEST) - - def _get_fuzzy_padding(self, lean): - """ - Subclasses should override this to pad based on how precise they are. 
- """ - return relativedelta(0) - - def get_is_approximate(self): - return getattr(self, "_is_approximate", False) - - def set_is_approximate(self, val): - self._is_approximate = val - - is_approximate = property(get_is_approximate, set_is_approximate) - - def get_is_uncertain(self): - return getattr(self, "_is_uncertain", False) - - def set_is_uncertain(self, val): - self._is_uncertain = val - - is_uncertain = property(get_is_uncertain, set_is_uncertain) - - def get_is_uncertain_and_approximate(self): - return getattr(self, "_uncertain_and_approximate", False) - - def set_is_uncertain_and_approximate(self, val): - self._uncertain_and_approximate = val - - is_uncertain_and_approximate = property( - get_is_uncertain_and_approximate, set_is_uncertain_and_approximate - ) - - def lower_fuzzy(self): - strict_val = self.lower_strict() - return apply_delta(sub, strict_val, self._get_fuzzy_padding(EARLIEST)) - - def upper_fuzzy(self): - strict_val = self.upper_strict() - return apply_delta(add, strict_val, self._get_fuzzy_padding(LATEST)) - - def __eq__(self, other): - if isinstance(other, EDTFObject): - return str(self) == str(other) - elif isinstance(other, date): - return str(self) == other.isoformat() - elif isinstance(other, struct_time): - return self._strict_date() == trim_struct_time(other) - return False - - def __ne__(self, other): - if isinstance(other, EDTFObject): - return str(self) != str(other) - elif isinstance(other, date): - return str(self) != other.isoformat() - elif isinstance(other, struct_time): - return self._strict_date() != trim_struct_time(other) - return True - - def __gt__(self, other): - if isinstance(other, EDTFObject): - return self.lower_strict() > other.lower_strict() - elif isinstance(other, date): - return self.lower_strict() > dt_to_struct_time(other) - elif isinstance(other, struct_time): - return self.lower_strict() > trim_struct_time(other) - raise TypeError( - f"can't compare {type(self).__name__} with {type(other).__name__}" - ) - 
- def __ge__(self, other): - if isinstance(other, EDTFObject): - return self.lower_strict() >= other.lower_strict() - elif isinstance(other, date): - return self.lower_strict() >= dt_to_struct_time(other) - elif isinstance(other, struct_time): - return self.lower_strict() >= trim_struct_time(other) - raise TypeError( - f"can't compare {type(self).__name__} with {type(other).__name__}" - ) - - def __lt__(self, other): - if isinstance(other, EDTFObject): - return self.lower_strict() < other.lower_strict() - elif isinstance(other, date): - return self.lower_strict() < dt_to_struct_time(other) - elif isinstance(other, struct_time): - return self.lower_strict() < trim_struct_time(other) - raise TypeError( - f"can't compare {type(self).__name__} with {type(other).__name__}" - ) - - def __le__(self, other): - if isinstance(other, EDTFObject): - return self.lower_strict() <= other.lower_strict() - elif isinstance(other, date): - return self.lower_strict() <= dt_to_struct_time(other) - elif isinstance(other, struct_time): - return self.lower_strict() <= trim_struct_time(other) - raise TypeError( - f"can't compare {type(self).__name__} with {type(other).__name__}" - ) - - -# (* ************************** Level 0 *************************** *) - - -class Date(EDTFObject): - def set_year(self, y): - if y is None: - raise AttributeError("Year must not be None") - self._year = y - - def get_year(self): - return self._year - - year = property(get_year, set_year) - - def set_month(self, m): - self._month = m - if m is None: - self.day = None - - def get_month(self): - return self._month - - month = property(get_month, set_month) - - def __init__(self, year=None, month=None, day=None, **kwargs): - for param in ("date", "lower", "upper"): - if param in kwargs: - self.__init__(**kwargs[param]) - return - - self.year = year # Year is required, but sometimes passed in as a 'date' dict. 
- self.month = month - self.day = day - - def __str__(self): - r = self.year - if self.month: - r += f"-{self.month}" - if self.day: - r += f"-{self.day}" - return r - - def isoformat(self, default=date.max): - return "%s-%02d-%02d" % ( - self.year, - int(self.month or default.month), - int(self.day or default.day), - ) - - def _precise_year(self, lean): - # Replace any ambiguous characters in the year string with 0s or 9s - if lean == EARLIEST: - return int(re.sub(r"X", r"0", self.year)) - else: - return int(re.sub(r"X", r"9", self.year)) - - def _precise_month(self, lean): - if self.month and self.month != "XX": - try: - return int(self.month) - except ValueError as err: - raise ValueError( - f"Couldn't convert {self.month} to int (in {self})" - ) from err - else: - return 1 if lean == EARLIEST else 12 - - def _precise_day(self, lean): - if not self.day or self.day == "XX": - if lean == EARLIEST: - return 1 - else: - return days_in_month( - self._precise_year(LATEST), self._precise_month(LATEST) - ) - else: - return int(self.day) - - def _strict_date(self, lean): - """ - Return a `time.struct_time` representation of the date. 
- """ - return struct_time( - ( - self._precise_year(lean), - self._precise_month(lean), - self._precise_day(lean), - ) - + tuple(TIME_EMPTY_TIME) - + tuple(TIME_EMPTY_EXTRAS) - ) - - @property - def precision(self): - if self.day: - return PRECISION_DAY - if self.month: - return PRECISION_MONTH - return PRECISION_YEAR - - -class DateAndTime(EDTFObject): - def __init__(self, date, time): - self.date = date - self.time = time - - def __str__(self): - return self.isoformat() - - def isoformat(self): - return self.date.isoformat() + "T" + self.time - - def _strict_date(self, lean): - return self.date._strict_date(lean) - - def __eq__(self, other): - if isinstance(other, datetime): - return self.isoformat() == other.isoformat() - elif isinstance(other, struct_time): - return self._strict_date() == trim_struct_time(other) - return super().__eq__(other) - - def __ne__(self, other): - if isinstance(other, datetime): - return self.isoformat() != other.isoformat() - elif isinstance(other, struct_time): - return self._strict_date() != trim_struct_time(other) - return super().__ne__(other) - - -class Interval(EDTFObject): - def __init__(self, lower, upper): - self.lower = lower - self.upper = upper - - def __str__(self): - return f"{self.lower}/{self.upper}" - - def _strict_date(self, lean): - if lean == EARLIEST: - try: - r = self.lower._strict_date(lean) - if r is None: - raise AttributeError - return r - except ( - AttributeError - ): # it's a string, or no date. 
Result depends on the upper date - upper = self.upper._strict_date(LATEST) - return apply_delta(sub, upper, appsettings.DELTA_IF_UNKNOWN) - else: - try: - r = self.upper._strict_date(lean) - if r is None: - raise AttributeError - return r - except ( - AttributeError - ): # an 'unknown' or 'open' string - depends on the lower date - if self.upper and (self.upper == "open" or self.upper.date == "open"): - return dt_to_struct_time(date.today()) # it's still happening - else: - lower = self.lower._strict_date(EARLIEST) - return apply_delta(add, lower, appsettings.DELTA_IF_UNKNOWN) - - -# (* ************************** Level 1 *************************** *) - - -class UA(EDTFObject): - @classmethod - def parse_action(cls, toks): - args = toks.asList() - return cls(*args) - - def __init__(self, *args): - assert len(args) == 1 - ua = args[0] - - self.is_uncertain = "?" in ua - self.is_approximate = "~" in ua - self.is_uncertain_and_approximate = "%" in ua - - def __str__(self): - d = "" - if self.is_uncertain: - d += "?" 
- if self.is_approximate: - d += "~" - if self.is_uncertain_and_approximate: - d += "%" - return d - - def _get_multiplier(self): - if self.is_uncertain_and_approximate: - return appsettings.MULTIPLIER_IF_BOTH - elif self.is_uncertain: - return appsettings.MULTIPLIER_IF_UNCERTAIN - elif self.is_approximate: - return appsettings.MULTIPLIER_IF_APPROXIMATE - - -class UncertainOrApproximate(EDTFObject): - def __init__(self, date, ua): - self.date = date - self.ua = ua - - def __str__(self): - if self.ua: - return f"{self.date}{self.ua}" - else: - return str(self.date) - - def _strict_date(self, lean): - if self.date == "open": - return None # depends on the other date - return dt_to_struct_time(date.today()) - if self.date == "unknown": - return None # depends on the other date - return self.date._strict_date(lean) - - def _get_fuzzy_padding(self, lean): - if not self.ua: - return relativedelta(0) - multiplier = self.ua._get_multiplier() - - if self.date.precision == PRECISION_DAY: - return multiplier * appsettings.PADDING_DAY_PRECISION - elif self.date.precision == PRECISION_MONTH: - return multiplier * appsettings.PADDING_MONTH_PRECISION - elif self.date.precision == PRECISION_YEAR: - return multiplier * appsettings.PADDING_YEAR_PRECISION - - -class Testi(EDTFObject): - # @classmethod - # def parse_action(cls, toks): - # args = toks.asList() - # return cls(*args) - - def __init__(self, **args): - print(args) - - -class UnspecifiedIntervalSection(EDTFObject): - def __init__(self, sectionOpen=False, other_section_element=None): - if sectionOpen: - self.is_open = True - self.is_unknown = False - else: - self.is_open = False - self.is_unknown = True - self.other = other_section_element - - def __str__(self): - if self.is_unknown: - return "" - else: - return ".." 
- - def _strict_date(self, lean): - if lean == EARLIEST: - if self.is_unknown: - upper = self.other._strict_date(LATEST) - return apply_delta(sub, upper, appsettings.DELTA_IF_UNKNOWN) - else: - return dt_to_struct_time( - date.min - ) # from the beginning of time; *ahem, i mean python datetime - else: - if self.is_unknown: - lower = self.other._strict_date(EARLIEST) - return apply_delta(add, lower, appsettings.DELTA_IF_UNKNOWN) - else: - return dt_to_struct_time(date.max) # to then end of python datetime - - -class Unspecified(Date): - pass - - -class Level1Interval(Interval): - def __init__(self, lower=None, upper=None): - if lower: - if lower["date"] == "..": - self.lower = UnspecifiedIntervalSection( - True, UncertainOrApproximate(**upper) - ) - else: - self.lower = UncertainOrApproximate(**lower) - else: - self.lower = UnspecifiedIntervalSection( - False, UncertainOrApproximate(**upper) - ) - if upper: - if upper["date"] == "..": - self.upper = UnspecifiedIntervalSection( - True, UncertainOrApproximate(**lower) - ) - else: - self.upper = UncertainOrApproximate(**upper) - else: - self.upper = UnspecifiedIntervalSection( - False, UncertainOrApproximate(**lower) - ) - - def _get_fuzzy_padding(self, lean): - if lean == EARLIEST: - return self.lower._get_fuzzy_padding(lean) - elif lean == LATEST: - return self.upper._get_fuzzy_padding(lean) - - -class LongYear(EDTFObject): - def __init__(self, year): - self.year = year - - def __str__(self): - return f"Y{self.year}" - - def _precise_year(self): - return int(self.year) - - def _strict_date(self, lean): - py = self._precise_year() - if lean == EARLIEST: - return struct_time([py, 1, 1] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS) - else: - return struct_time([py, 12, 31] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS) - - -class Season(Date): - def __init__(self, year, season, **kwargs): - self.year = year - self.season = season # use season to look up month - # day isn't part of the 'season' spec, but it helps the inherited - # 
`Date` methods do their thing. - self.day = None - - def __str__(self): - return f"{self.year}-{self.season}" - - def _precise_month(self, lean): - rng = appsettings.SEASON_MONTHS_RANGE[int(self.season)] - if lean == EARLIEST: - return rng[0] - else: - return rng[1] - - -# (* ************************** Level 2 *************************** *) - - -class PartialUncertainOrApproximate(Date): - def set_year(self, y): # Year can be None. - self._year = y - - year = property(Date.get_year, set_year) - - def __init__( - self, - year=None, - month=None, - day=None, - year_ua=False, - month_ua=False, - day_ua=False, - year_month_ua=False, - month_day_ua=False, - ssn=None, - season_ua=False, - all_ua=False, - ): - self.year = year - self.month = month - self.day = day - - self.year_ua = year_ua - self.month_ua = month_ua - self.day_ua = day_ua - - self.year_month_ua = year_month_ua - self.month_day_ua = month_day_ua - - self.season = ssn - self.season_ua = season_ua - - self.all_ua = all_ua - - def __str__(self): - if self.season_ua: - return f"{self.season}{self.season_ua}" - - y = f"{self.year}{self.year_ua}" if self.year_ua else str(self.year) - - m = f"({self.month}){self.month_ua}" if self.month_ua else str(self.month) - - if self.day: - d = f"({self.day}){self.day_ua}" if self.day_ua else str(self.day) - else: - d = None - - if self.year_month_ua: # year/month approximate. No brackets needed. 
- ym = f"{y}-{m}{self.year_month_ua}" - result = f"{ym}-{d}" if d else ym - elif self.month_day_ua: - if self.year_ua: # we don't need the brackets round month and day - result = f"{y}-{m}-{d}{self.month_day_ua}" - else: - result = f"{y}-({m}-{d}){self.month_day_ua}" - else: - result = f"{y}-{m}-{d}" if d else f"{y}-{m}" - - if self.all_ua: - result = f"({result}){self.all_ua}" - - return result - - def _precise_year(self, lean): - if self.season: - return self.season._precise_year(lean) - return super()._precise_year(lean) - - def _precise_month(self, lean): - if self.season: - return self.season._precise_month(lean) - return super()._precise_month(lean) - - def _precise_day(self, lean): - if self.season: - return self.season._precise_day(lean) - return super()._precise_day(lean) - - def _get_fuzzy_padding(self, lean): - """ - This is not a perfect interpretation as fuzziness is introduced for - redundant uncertainly modifiers e.g. (2006~)~ will get two sets of - fuzziness. - """ - result = relativedelta(0) - - if self.year_ua: - result += ( - appsettings.PADDING_YEAR_PRECISION * self.year_ua._get_multiplier() - ) - if self.month_ua: - result += ( - appsettings.PADDING_MONTH_PRECISION * self.month_ua._get_multiplier() - ) - if self.day_ua: - result += appsettings.PADDING_DAY_PRECISION * self.day_ua._get_multiplier() - - if self.year_month_ua: - result += ( - appsettings.PADDING_YEAR_PRECISION - * self.year_month_ua._get_multiplier() - ) - result += ( - appsettings.PADDING_MONTH_PRECISION - * self.year_month_ua._get_multiplier() - ) - if self.month_day_ua: - result += ( - appsettings.PADDING_DAY_PRECISION * self.month_day_ua._get_multiplier() - ) - result += ( - appsettings.PADDING_MONTH_PRECISION - * self.month_day_ua._get_multiplier() - ) - - if self.season_ua: - result += ( - appsettings.PADDING_SEASON_PRECISION * self.season_ua._get_multiplier() - ) - - if self.all_ua: - multiplier = self.all_ua._get_multiplier() - - if self.precision == PRECISION_DAY: - result 
+= multiplier * appsettings.PADDING_DAY_PRECISION - result += multiplier * appsettings.PADDING_MONTH_PRECISION - result += multiplier * appsettings.PADDING_YEAR_PRECISION - elif self.precision == PRECISION_MONTH: - result += multiplier * appsettings.PADDING_MONTH_PRECISION - result += multiplier * appsettings.PADDING_YEAR_PRECISION - elif self.precision == PRECISION_YEAR: - result += multiplier * appsettings.PADDING_YEAR_PRECISION - - return result - - -class PartialUnspecified(Unspecified): - pass - - -class Consecutives(Interval): - # Treating Consecutive ranges as intervals where one bound is optional - def __init__(self, lower=None, upper=None): - if lower and not isinstance(lower, EDTFObject): - self.lower = Date.parse(lower) - else: - self.lower = lower - - if upper and not isinstance(upper, EDTFObject): - self.upper = Date.parse(upper) - else: - self.upper = upper - - def __str__(self): - return "{}..{}".format(self.lower or "", self.upper or "") - - -class EarlierConsecutives(Consecutives): - pass - - -class LaterConsecutives(Consecutives): - pass - - -class OneOfASet(EDTFObject): - @classmethod - def parse_action(cls, toks): - args = [t for t in toks.asList() if isinstance(t, EDTFObject)] - return cls(*args) - - def __init__(self, *args): - self.objects = args - - def __str__(self): - return "[{}]".format(", ".join([str(o) for o in self.objects])) - - def _strict_date(self, lean): - if lean == LATEST: - return max([x._strict_date(lean) for x in self.objects]) - else: - return min([x._strict_date(lean) for x in self.objects]) - - -class MultipleDates(EDTFObject): - @classmethod - def parse_action(cls, toks): - args = [t for t in toks.asList() if isinstance(t, EDTFObject)] - return cls(*args) - - def __init__(self, *args): - self.objects = args - - def __str__(self): - return "{{{}}}".format(", ".join([str(o) for o in self.objects])) - - def _strict_date(self, lean): - if lean == LATEST: - return max([x._strict_date(lean) for x in self.objects]) - else: - 
return min([x._strict_date(lean) for x in self.objects]) - - -class MaskedPrecision(Date): - pass - - -class Level2Interval(Level1Interval): - def __init__(self, lower, upper): - # Check whether incoming lower/upper values are single-item lists, and - # if so take just the first item. This works around what I *think* is a - # bug in the grammer that provides us with single-item lists of - # `PartialUncertainOrApproximate` items for lower/upper values. - if isinstance(lower, (tuple, list)) and len(lower) == 1: - self.lower = lower[0] - else: - self.lower = lower - if isinstance(lower, (tuple, list)) and len(upper) == 1: - self.upper = upper[0] - else: - self.upper = upper - - -class ExponentialYear(LongYear): - def __init__(self, base, exponent, precision=None): - self.base = base - self.exponent = exponent - self.precision = precision - - def _precise_year(self): - return int(self.base) * 10 ** int(self.exponent) - - def get_year(self): - if self.precision: - return f"{self.base}E{self.exponent}S{self.precision}" - else: - return f"{self.base}E{self.exponent}" - - year = property(get_year) diff --git a/edtf/parser/tests.py b/edtf/parser/tests.py index 52248f0..8b3c1d9 100644 --- a/edtf/parser/tests.py +++ b/edtf/parser/tests.py @@ -14,8 +14,8 @@ # where the first value is a tuple, the second item is a tuple of the normalised parse result. # # The values in the second tuple indicate the iso versions of the derived Python `date`s. -# - If there's one other value, all the derived dates should be the same. -# - If there're two other values, then all the lower values should be the same +# - If there is one other value, all the derived dates should be the same. +# - If there are two other values, then all the lower values should be the same # and all the upper values should be the same. 
# - If there are three other values, then the upper and lower ``_strict`` values # should be the first value, and the upper and lower ``_fuzzy`` values should be @@ -61,8 +61,11 @@ # Uncertain/Approximate # uncertain: possibly the year 1984, but not definitely ("1984?", ("1984-01-01", "1984-12-31", "1983-01-01", "1985-12-31")), - ("2004-06-11?", ("2004-06-11", "2004-06-11", "2004-06-10", "2004-06-12")), - ("2004-06?", ("2004-06-01", "2004-06-30", "2004-05-01", "2004-07-30")), + ( + "2004-06-11?", + ("2004-06-11", "2003-05-10", "2005-07-12"), + ), # everything is fuzzy by 100% for "qualification of a date (complete)" (L1) + ("2004-06?", ("2004-06-01", "2004-06-30", "2003-05-01", "2005-07-30")), # "approximately" the year 1984 ("1984~", ("1984-01-01", "1984-12-31", "1983-01-01", "1985-12-31")), # the year is approximately 1984 and even that is uncertain @@ -78,12 +81,23 @@ ("1999-01-XX", ("1999-01-01", "1999-01-31")), # some day in 1999 ("1999-XX-XX", ("1999-01-01", "1999-12-31")), + # negative unspecified year + ("-01XX", ("-0199-01-01", "-0100-12-31")), # Uncertain/Approximate lower boundary dates (BCE) ("-0275~", ("-0275-01-01", "-0275-12-31", "-0276-01-01", "-0274-12-31")), ("-0001~", ("-0001-01-01", "-0001-12-31", "-0002-01-01", "0000-12-31")), ("0000~", ("0000-01-01", "0000-12-31", "-0001-01-01", "0001-12-31")), + # Unspecified and qualified + # "circa 17th century" + ("16XX~", ("1600-01-01", "1699-12-31", "1500-01-01", "1799-12-31")), + ("16XX%", ("1600-01-01", "1699-12-31", "1400-01-01", "1899-12-31")), + ("1XXX", ("1000-01-01", "1999-12-31")), + ("1XXX~", ("1000-01-01", "1999-12-31", "0000-01-01", "2999-12-31")), + ("156X~", ("1560-01-01", "1569-12-31", "1550-01-01", "1579-12-31")), + ("-01XX~", ("-0199-01-01", "-0100-12-31", "-0299-01-01", "0000-12-31")), # L1 Extended Interval # beginning unknown, end 2006 + # for intervals with an unknown beginning or end, the unknown bound is calculated with the constant DELTA_IF_UNKNOWN (10 years) ("/2006", 
("1996-12-31", "2006-12-31")), # beginning June 1, 2004, end unknown ("2004-06-01/", ("2004-06-01", "2014-06-01")), @@ -94,16 +108,16 @@ # interval beginning approximately 1984 and ending June 2004 ("1984~/2004-06", ("1984-01-01", "2004-06-30", "1983-01-01", "2004-06-30")), # interval beginning 1984 and ending approximately June 2004 - ("1984/2004-06~", ("1984-01-01", "2004-06-30", "1984-01-01", "2004-07-30")), + ("1984/2004-06~", ("1984-01-01", "2004-06-30", "1984-01-01", "2005-07-30")), ("1984?/2004%", ("1984-01-01", "2004-12-31", "1983-01-01", "2006-12-31")), ("1984~/2004~", ("1984-01-01", "2004-12-31", "1983-01-01", "2005-12-31")), # interval whose beginning is uncertain but thought to be 1984, and whose end is uncertain and approximate but thought to be 2004 - ("1984-06?/2004-08?", ("1984-06-01", "2004-08-31", "1984-05-01", "2004-09-30")), + ("1984-06?/2004-08?", ("1984-06-01", "2004-08-31", "1983-05-01", "2005-09-30")), ( "1984-06-02?/2004-08-08~", - ("1984-06-02", "2004-08-08", "1984-06-01", "2004-08-09"), + ("1984-06-02", "2004-08-08", "1983-05-01", "2005-09-09"), ), - ("1984-06-02?/", ("1984-06-02", "1994-06-02", "1984-06-01", "1994-06-02")), + ("1984-06-02?/", ("1984-06-02", "1994-06-02", "1983-05-01", "1994-06-02")), # Year exceeding 4 digits ("Y170000002", ("170000002-01-01", "170000002-12-31")), ("Y-170000002", ("-170000002-01-01", "-170000002-12-31")), @@ -113,28 +127,36 @@ ("2000-23", ("2000-09-01", "2000-11-30")), ("2010-24", ("2010-12-01", "2010-12-31")), # ******************************* LEVEL 2 ********************************* - # Partial Uncertain/Approximate + # Qualification + # Group qualification: a qualification character to the immediate right of a component applies + # to that component as well as to all components to the left. 
+ # year, month, and day are uncertain and approximate + # this example appears under "group qualification" but actually parses as L1 UncertainOrApproximate + ( + "2004-06-11%", + ("2004-06-11", "2002-04-09", "2006-08-13"), + ), # all parts to the left are fuzzy by 200% # uncertain year; month, day known ("2004?-06-11", ("2004-06-11", "2003-06-11", "2005-06-11")), # year and month are approximate; day known ("2004-06~-11", ("2004-06-11", "2003-05-11", "2005-07-11")), - # uncertain month, year and day known - ("2004-?06-11", ("2004-06-11", "2004-05-11", "2004-07-11")), + # Qualification of individual component: a qualification character to the immediate left + # of the component applies to that component only # day is approximate; year, month known ("2004-06-~11", ("2004-06-11", "2004-06-10", "2004-06-12")), - # Year known, month within year is approximate and uncertain - NEW SPEC + # Year known, month within year is approximate and uncertain ("2004-%06", ("2004-06-01", "2004-06-30", "2004-04-01", "2004-08-30")), - # Year known, month and day uncertain - NEW SPEC + # Year known, month and day uncertain ("2004-?06-?11", ("2004-06-11", "2004-05-10", "2004-07-12")), - # Year uncertain, month known, day approximate - NEW SPEC + # Year uncertain, month known, day approximate ("2004?-06-~11", ("2004-06-11", "2003-06-10", "2005-06-12")), - # Year uncertain and month is both uncertain and approximate - NEW SPEC + # Year uncertain and month is both uncertain and approximate ("?2004-%06", ("2004-06-01", "2004-06-30", "2003-04-01", "2005-08-30")), # This has the same meaning as the previous example.- NEW SPEC ("2004?-%06", ("2004-06-01", "2004-06-30", "2003-04-01", "2005-08-30")), - # Year uncertain, month and day approximate. - NEW SPEC + # Year uncertain, month and day approximate ("2004?-~06-~04", ("2004-06-04", "2003-05-03", "2005-07-05")), - # Year known, month and day approximate. 
- NEW SPEC + # Year known, month and day approximate ("2011-~06-~04", ("2011-06-04", "2011-05-03", "2011-07-05")), # Partial unspecified # December 25 sometime during the 1560s @@ -154,12 +176,7 @@ # December 1760 or some later month ("[1760-12..]", ("1760-12-01", "inf")), # January or February of 1760 or December 1760 or some later month - # This test is failing due to a code issue: - # TypeError: '>' not supported between instances of 'float' and 'time.struct_time' - ( - "[1760-01, 1760-02, 1760-12..]", - ("1760-01-01", "inf"), - ), # TODO fix in parser_classes + ("[1760-01, 1760-02, 1760-12..]", ("1760-01-01", "inf")), # Either the year 1667 or the month December of 1760. ("[1667, 1760-12]", ("1667-01-01", "1760-12-31")), # Multiple Dates @@ -167,11 +184,11 @@ ("{1667,1668, 1670..1672}", ("1667-01-01", "1672-12-31")), # The year 1960 and the month December of 1961. ("{1960, 1961-12}", ("1960-01-01", "1961-12-31")), - # Masked Precision --> eliminated + # Previously tested masked precision, now eliminated from the spec # A date during the 1960s - # ('196x', '1960-01-01', '1969-12-31'), + ("196X", ("1960-01-01", "1969-12-31")), # A date during the 1900s - # ('19xx', '1900-01-01', '1999-12-31'), + ("19XX", ("1900-01-01", "1999-12-31")), # L2 Extended Interval # Interval with fuzzy day endpoints in June 2004 ( @@ -185,9 +202,23 @@ ("Y17E7", ("170000000-01-01", "170000000-12-31")), # the year -170000000 ("Y-17E7", ("-170000000-01-01", "-170000000-12-31")), + # L2 significant digits + # Some year between 1900 and 1999, estimated to be 1950 + ("1950S2", ("1950-01-01", "1950-12-31", "1900-01-01", "1999-12-31")), + ("1953S2", ("1953-01-01", "1953-12-31", "1900-01-01", "1999-12-31")), + ("1953S3", ("1953-01-01", "1953-12-31", "1950-01-01", "1959-12-31")), # Some year between 171010000 and 171999999, estimated to be 171010000 ('S3' indicates a precision of 3 significant digits.) 
- # TODO Not yet implemented, see https://github.com/ixc/python-edtf/issues/12 - # ('Y17101E4S3', ('171010000-01-01', '171999999-12-31')), + ( + "Y17101E4S3", + ("171010000-01-01", "171010000-12-31", "171000000-01-01", "171999999-12-31"), + ), + # Some year between 338000 and 338999, estimated to be 338800 + ("Y3388E2S3", ("338800-01-01", "338800-12-31", "338000-01-01", "338999-12-31")), + # some year between 171000000 and 171999999 estimated to be 171010000 + ( + "Y171010000S3", + ("171010000-01-01", "171010000-12-31", "171000000-01-01", "171999999-12-31"), + ), # L2 Seasons # Spring southern hemisphere, 2001 ("2001-29", ("2001-09-01", "2001-11-30")), @@ -195,7 +226,53 @@ ("2001-34", ("2001-04-01", "2001-06-30")), ) +BENCHMARK_EXAMPLES = ( + "2001-02-03", + "2008-12", + "2008", + "-0999", + "2004-01-01T10:10:10+05:00", + "-2005/-1999-02", + "/2006", + "?2004-%06", + "[1667, 1760-12]", + "Y3388E2S3", + "2001-29", +) + +APPROXIMATE_UNCERTAIN_EXAMPLES = ( + # first part of tuple is the input EDTF string, second part is a tuple of booleans: + # uncertain ?, approximate ~, both uncertain and approximate % + ("2004", (False, False, False)), + ("2006-06-11", (False, False, False)), + ("-0999", (False, False, False)), + ("1984?", (True, False, False)), + ("2004-06-11?", (True, False, False)), + ("1984~", (False, True, False)), + ("1984%", (False, False, True)), + ("1984~/2004-06", (False, True, False)), + ("2004-%06", (False, False, True)), + ("2004?-~06-~04", (True, True, False)), + ("2004?-06-04", (True, False, False)), + ("2011-~06-~04", (False, True, False)), + ("2004-06-~01/2004-06-~20", (False, True, False)), + ("156X~", (False, True, False)), + ("?1945/1959", (True, False, False)), + ("?1945", (True, False, False)), + ("?1945-01", (True, False, False)), + ("?1945-01-01", (True, False, False)), + ("~1945/1959", (False, True, False)), + ("~1945", (False, True, False)), + ("~1945-01", (False, True, False)), + ("~1945-01-01", (False, True, False)), + ("%1945/1959", 
(False, False, True)), + ("%1945", (False, False, True)), + ("%1945-01", (False, False, True)), + ("%1945-01-01", (False, False, True)), +) + BAD_EXAMPLES = ( + # parentheses are not used for group qualification in the 2018 spec None, "", "not a edtf string", @@ -247,31 +324,51 @@ def test_edtf_examples(test_input, expected_tuple): # Unpack expected results based on their count if len(expected_tuple) == 1: - assert ( - result_date == expected_tuple[0] - ), f"Expected {expected_tuple[0]}, got {result_date}" + assert result_date == expected_tuple[0], ( + f"Expected {expected_tuple[0]}, got {result_date}" + ) elif len(expected_tuple) == 2: lower_strict = iso_to_struct_time(expected_tuple[0]) upper_strict = iso_to_struct_time(expected_tuple[1]) - assert result.lower_strict() == lower_strict, "Lower strict date does not match" - assert result.upper_strict() == upper_strict, "Upper strict date does not match" + assert result.lower_strict() == lower_strict, ( + f"Lower strict date does not match. Expected {lower_strict}, got {result.lower_strict()}" + ) + assert result.upper_strict() == upper_strict, ( + f"Upper strict date does not match. Expected {upper_strict}, got {result.upper_strict()}" + ) elif len(expected_tuple) == 3: strict_date = iso_to_struct_time(expected_tuple[0]) lower_fuzzy = iso_to_struct_time(expected_tuple[1]) upper_fuzzy = iso_to_struct_time(expected_tuple[2]) - assert result.lower_strict() == strict_date, "Lower strict date does not match" - assert result.upper_strict() == strict_date, "Upper strict date does not match" - assert result.lower_fuzzy() == lower_fuzzy, "Lower fuzzy date does not match" - assert result.upper_fuzzy() == upper_fuzzy, "Upper fuzzy date does not match" + assert result.lower_strict() == strict_date, ( + f"Lower strict date does not match. Expected {strict_date}, got {result.lower_strict()}" + ) + assert result.upper_strict() == strict_date, ( + f"Upper strict date does not match. 
Expected {strict_date}, got {result.upper_strict()}" + ) + assert result.lower_fuzzy() == lower_fuzzy, ( + f"Lower fuzzy date does not match. Expected {lower_fuzzy}, got {result.lower_fuzzy()}" + ) + assert result.upper_fuzzy() == upper_fuzzy, ( + f"Upper fuzzy date does not match. Expected {upper_fuzzy}, got {result.upper_fuzzy()}" + ) elif len(expected_tuple) == 4: lower_strict = iso_to_struct_time(expected_tuple[0]) upper_strict = iso_to_struct_time(expected_tuple[1]) lower_fuzzy = iso_to_struct_time(expected_tuple[2]) upper_fuzzy = iso_to_struct_time(expected_tuple[3]) - assert result.lower_strict() == lower_strict, "Lower strict date does not match" - assert result.upper_strict() == upper_strict, "Upper strict date does not match" - assert result.lower_fuzzy() == lower_fuzzy, "Lower fuzzy date does not match" - assert result.upper_fuzzy() == upper_fuzzy, "Upper fuzzy date does not match" + assert result.lower_strict() == lower_strict, ( + f"Lower strict date does not match. Expected {lower_strict}, got {result.lower_strict()}" + ) + assert result.upper_strict() == upper_strict, ( + f"Upper strict date does not match. Expected {upper_strict}, got {result.upper_strict()}" + ) + assert result.lower_fuzzy() == lower_fuzzy, ( + f"Lower fuzzy date does not match. Expected {lower_fuzzy}, got {result.lower_fuzzy()}" + ) + assert result.upper_fuzzy() == upper_fuzzy, ( + f"Upper fuzzy date does not match. 
Expected {upper_fuzzy}, got {result.upper_fuzzy()}" + ) @pytest.mark.parametrize("bad_input", BAD_EXAMPLES) @@ -281,6 +378,14 @@ def test_non_parsing(bad_input): parse(bad_input) +@pytest.mark.parametrize("bad_input", [None, ""]) +def test_empty_input(bad_input): + """Test that empty input raises a specific exception.""" + with pytest.raises(EDTFParseException) as exc_info: + parse(bad_input) + assert "You must supply some input text" in str(exc_info.value) + + def test_comparisons(): """Test comparisons between parsed EDTF objects and standard dates.""" d1 = parse("1979-08~") @@ -298,3 +403,24 @@ def test_comparisons(): assert d4 == d5 assert d1 < d5 assert d1 > d6 + + +@pytest.mark.benchmark +@pytest.mark.parametrize("test_input", BENCHMARK_EXAMPLES) +def test_benchmark_parser(benchmark, test_input): + """Benchmark parsing of selected EDTF strings.""" + benchmark(parse, test_input) + + +@pytest.mark.parametrize("test_input,expected_tuple", APPROXIMATE_UNCERTAIN_EXAMPLES) +def test_approximate_uncertain(test_input, expected_tuple): + """Test parsing of EDTF strings and check .is_uncertain, .is_approximate, + and .is_uncertain_and_approximate properties. 
The expected_tuple should have three + values, the first should be a boolean indicating if the date is uncertain, + the second should be a boolean indicating if the date is approximate, and the + third should be a boolean indicating if the date is both uncertain and approximate.""" + result = parse(test_input) + assert isinstance(result, EDTFObject), "Result should be an instance of EDTFObject" + assert result.is_uncertain == expected_tuple[0] + assert result.is_approximate == expected_tuple[1] + assert result.is_uncertain_and_approximate == expected_tuple[2] diff --git a/edtf/py.typed b/edtf/py.typed new file mode 100644 index 0000000..e69de29 diff --git a/edtf/tests.py b/edtf/tests.py index 9812b65..837e580 100644 --- a/edtf/tests.py +++ b/edtf/tests.py @@ -4,6 +4,9 @@ from time import struct_time from edtf import convert +from edtf.parser.edtf_exceptions import EDTFParseException +from edtf.parser.grammar import parse_edtf +from edtf.util import remapparams def test_dt_to_struct_time_for_datetime(): @@ -107,3 +110,75 @@ def test_roll_negative_time_fields(): assert convert._roll_negative_time_fields( year, month, day, hour, minute, second ) == (-102, 5, 24, 21, 41, 47) + + +def test_remapparams(): + @remapparams(parseAll="parse_all") + def parser(s, parse_all=True): + pass + + assert parser.__name__ == "parser" # noqa: S101 + parser("foo") + # this should not warn + parser("foo", parse_all=False) + # this should warn, but only once + for _ in 1, 2: + parser("foo", parseAll=False) + try: + parser("foo", parseAll=False, parse_all=True) + except ValueError: + pass + else: + raise AssertionError("expected ValueError because of duplicated parameters") + + try: + + @remapparams() + def no_remappings(): + pass + except ValueError: + pass + else: + raise AssertionError( + "expected ValueError from @remapparams() because no remappings" + ) + try: + + @remapparams(p1="p2", p2="p3") + def no_remappings(): + pass + except ValueError: + pass + else: + raise AssertionError( + 
"expected ValueError from @remapparams() because p1 remaps to another remapped parameter" + ) + + +def test_remapparams_parse_edtf(): + edtf_s = "2005-09-24T10:00:00" # ISO8601 example from the EDTF spec + dat = parse_edtf(edtf_s) # implicit parse_all=True + assert dat.isoformat() == edtf_s + assert parse_edtf(edtf_s, parse_all=True).isoformat() == edtf_s + assert parse_edtf(edtf_s, parseAll=True).isoformat() == edtf_s + assert parse_edtf(f"{edtf_s} SNORT", parse_all=False).isoformat() == edtf_s + assert parse_edtf(f"{edtf_s} SNORT", parseAll=False).isoformat() == edtf_s + # make sure parse_all=True fails the SNORT parse + try: + parse_edtf(f"{edtf_s} SNORT") + except EDTFParseException: + pass + else: + raise AssertionError("expected EDTFParseException") + try: + parse_edtf(f"{edtf_s} SNORT", parse_all=True) + except EDTFParseException: + pass + else: + raise AssertionError("expected EDTFParseException") + try: + parse_edtf(f"{edtf_s} SNORT", parseAll=True) + except EDTFParseException: + pass + else: + raise AssertionError("expected EDTFParseException") diff --git a/edtf/util.py b/edtf/util.py new file mode 100644 index 0000000..146eec2 --- /dev/null +++ b/edtf/util.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 + +""" +Assorted utility functions. +""" + +from functools import update_wrapper +from logging import warning +from traceback import extract_stack + + +def remapparams(**remap): + """ + Remap the specified named parameters. 
+ + Example to support an obsolete `parseAll` parameter: + + @remapparams(parseAll='parse_all') + def parse(s, parse_all=True): + + """ + if not remap: + raise ValueError("no parameters specified for remapping") + for old, new in remap.items(): + if new in remap: + raise ValueError(f"{old}={new!r}: {new!r} is also remapped") + + def remapparams_decorator(func): + """The decorator to apply the remappings.""" + # a record of callers whose parameters were remapped + remapped_callers = set() + + def remapparams_wrapper(*a, **kw): + remappings = {} + for param, value in list(kw.items()): + try: + remapped = remap[param] + except KeyError: + continue + if remapped in kw: + raise ValueError( + f"remap {param}= to {remapped}=: this is already present in the keyword arguments" + ) + del kw[param] + kw[remapped] = value + remappings[param] = remapped + if remappings: + caller_frame = extract_stack(limit=2)[-2] + caller_key = caller_frame.filename, caller_frame.lineno + if caller_key not in remapped_callers: + warning( + "call of %s.%s() from %s:%d: remapped the following obsolete parameters: %s", + func.__module__, + func.__name__, + caller_frame.filename, + caller_frame.lineno, + ", ".join( + sorted(f"{old}->{new}" for old, new in remappings.items()) + ), + ) + remapped_callers.add(caller_key) + return func(*a, **kw) + + update_wrapper(remapparams_wrapper, func) + return remapparams_wrapper + + return remapparams_decorator diff --git a/edtf_django_tests/edtf_integration/admin.py b/edtf_django_tests/edtf_integration/admin.py index 846f6b4..3051891 100644 --- a/edtf_django_tests/edtf_integration/admin.py +++ b/edtf_django_tests/edtf_integration/admin.py @@ -1 +1,43 @@ -# Register your models here. 
+from django.contrib import admin + +from .models import TestEvent + + +class TestEventAdmin(admin.ModelAdmin): + list_display = ( + "date_display", + "date_edtf_direct", + "date_earliest", + "date_latest", + "date_sort_ascending", + "date_sort_descending", + "date_edtf", + ) + search_fields = ("date_display", "date_edtf_direct") + list_filter = ("date_earliest", "date_latest") + readonly_fields = ( + "date_earliest", + "date_latest", + "date_sort_ascending", + "date_sort_descending", + "date_edtf", + ) + + fieldsets = ( + (None, {"fields": ("date_display", "date_edtf_direct", "date_edtf")}), + ( + "Computed Dates", + { + "classes": ("collapse",), + "fields": ( + "date_earliest", + "date_latest", + "date_sort_ascending", + "date_sort_descending", + ), + }, + ), + ) + + +admin.site.register(TestEvent, TestEventAdmin) diff --git a/edtf_django_tests/edtf_integration/models.py b/edtf_django_tests/edtf_integration/models.py index 5120889..5e66592 100644 --- a/edtf_django_tests/edtf_integration/models.py +++ b/edtf_django_tests/edtf_integration/models.py @@ -49,9 +49,5 @@ def __str__(self) -> str: return ( f"Test Event: {self.date_display=}, " f"{self.date_edtf_direct=}, " - f"{self.date_earliest=}, " - f"{self.date_latest=}, " - f"{self.date_sort_ascending=}, " - f"{self.date_sort_descending=}, " f"{self.date_edtf=}" ) diff --git a/edtf_django_tests/edtf_integration/tests.py b/edtf_django_tests/edtf_integration/tests.py index 88fdca8..aa1bf34 100644 --- a/edtf_django_tests/edtf_integration/tests.py +++ b/edtf_django_tests/edtf_integration/tests.py @@ -74,6 +74,26 @@ def test_date_display(self): self.assertEqual(self.event3.date_display, "2019-11") self.assertEqual(self.event4.date_display, "Approximately August 2018") + def test_date_display_with_none_or_empty_string(self): + """ + Test that the date_display field is correctly populated when the + `natural_date` field is set to empty string (for example, if it + were used with `null=False` in the model definition) or 
set to + None (if it were used with `null=True`). + """ + event = TestEvent(date_display="") + event.date_edtf_direct = "2020-03-15/2020-04-15" + # Trigger the descriptor to update the date_display field + event.date_edtf = "" + self.assertEqual(event.date_display, "2020-03-15/2020-04-15") + + event = TestEvent(date_display=None) + # Verify date_display is set to None even though the field is `null=False` + self.assertIsNone(event.date_display) + event.date_edtf_direct = "2020-03-15/2020-04-15" + event.date_edtf = "" + self.assertEqual(event.date_display, "2020-03-15/2020-04-15") + def test_comparison(self): # test equality of the same dates self.assertEqual( @@ -102,3 +122,34 @@ def test_comparison(self): self.event2.date_edtf, "2019-11 is less than 2021-05-06", ) + + def test_field_related_field_specification(self): + edtf_field_on_model = TestEvent._meta.get_field("date_edtf") + required_fields = ( + "direct_input_field", + "lower_fuzzy_field", + "lower_strict_field", + "natural_text_field", + "upper_fuzzy_field", + "upper_strict_field", + ) + for field_alias in required_fields: + # Remove the alias from the edtf_field + orig_value = getattr(edtf_field_on_model, field_alias) + setattr(edtf_field_on_model, field_alias, None) + errors = edtf_field_on_model.check() + self.assertEqual(len(errors), 1) + self.assertTrue(field_alias in errors[0].msg) + # Should be an 'alias not specified' error + self.assertEqual(errors[0].id, "python-edtf.EDTF01") + + # Point the alias to a non-existent field + setattr(edtf_field_on_model, field_alias, "fake") + errors = edtf_field_on_model.check() + self.assertEqual(len(errors), 1) + self.assertTrue(field_alias in errors[0].msg) + # Should be a 'non-eixstent field' error + self.assertEqual(errors[0].id, "python-edtf.EDTF02") + + # Repair the field so later tests can still work + setattr(edtf_field_on_model, field_alias, orig_value) diff --git a/pyproject.toml b/pyproject.toml index 869daf6..5915bde 100644 --- a/pyproject.toml +++ 
b/pyproject.toml @@ -1,21 +1,25 @@ [project] name = "edtf" version = "5.0.0" +license = { file = "LICENSE" } +keywords = ['edtf'] dependencies = [ "python-dateutil", - "pyparsing", - "six" + "pyparsing>=3.0.0", ] description = "Python implementation of Library of Congress EDTF (Extended Date Time Format) specification" -requires-python = ">=3.8" -readme = {file = "README.txt", content-type = "text/markdown"} +requires-python = ">=3.10" +readme = {file = "README.md", content-type = "text/markdown"} authors = [ { name = "The Interaction Consortium", email = "studio@interaction.net.au"}, { name = "Alastair Weakley"}, + { name = "Greg Turner"}, { name = "James Murty"}, { name = "Mark Finger" }, { name = "Sabine Müller" }, - { name = "Cole Crawford" } + { name = "Cole Crawford" }, + { name = "Klaus Rettinghaus" }, + { name = "Andrew Hankinson", email = "andrew.hankinson@rism.digital" }, ] maintainers = [ { name = "The Interaction Consortium", email = "studio@interaction.net.au" } @@ -32,14 +36,23 @@ classifiers = [ "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", ] [project.optional-dependencies] test = [ "django>=4.2,<5.0", "pytest", + "pytest-django", + "pytest-benchmark", "ruff", "pre-commit", + "coverage", + "pytest-cov", + "junitparser", + "mypy>=1.15.0", + "pip>=25.1.1", ] [project.urls] @@ -77,11 +90,30 @@ legacy_tox_ini = """ python_files = ["tests.py", "test_*.py", "*_test.py", "*_tests.py"] python_classes = ["Test*", "*Tests"] python_functions = ["test_*"] -addopts = "--ignore=edtf_django_tests/" +markers = [ + "benchmark: mark a test as a benchmark", +] +addopts = "--ignore=edtf_django_tests/ --cov=edtf -m 'not benchmark'" + +[tool.coverage.run] +# we run the edtf_integration tests but only care about them testing fields.py in the main package +omit =
[ + "edtf_django_tests/*" +] + +[tool.coverage.report] +exclude_lines = [ + # Don't complain about missing debug-only code: + "if __name__ == .__main__.:", + # Don't complain if tests don't hit defensive assertion code: + "raise AssertionError", + "raise NotImplementedError", + "raise NotImplemented" +] [tool.ruff] -# Python 3.8 -target-version = "py38" +# Keep in step with `requires-python` (>=3.10) so ruff never rewrites code to newer-only syntax +target-version = "py310" extend-exclude = [ '**/migrations/*', @@ -117,4 +149,6 @@ ignore = [ "E501", # Ignore McCabe complexity (for now). "C901", + # Ignore percent format -> format specifier rule for now (pending merge of #73 which resolves them) + "UP031", ] diff --git a/requirements.txt b/requirements.txt index 0ab3a7d..f142bc2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,2 @@ python-dateutil -pyparsing -six +pyparsing >= 3.0.0 diff --git a/vagrant wheel install problems.txt b/vagrant wheel install problems.txt deleted file mode 100644 index 174f67e..0000000 --- a/vagrant wheel install problems.txt +++ /dev/null @@ -1,5 +0,0 @@ -vagrant wheel install problems -https://stackoverflow.com/questions/56851961/how-to-fix-no-such-file-or-directory-error-in-setuptools-wheel-py157-convert - -from that link: -So it turns out that this problem was being caused by lag in Vagrant/Virtualbox's synced folders. I was trying to build the Python project inside a Vagrant VM shared from the host file system using a synced folder. Copying the project out of the synced folder into another folder in the VM allows it to build. Another dirty hack that worked was to add a time.sleep(1) in the setuptools/wheel.py source file on line 157 before the os.rename that was causing the OS Exception to be raised. This gives the file system a chance to sync, and therefore works around the issue. \ No newline at end of file