diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d5416ed..6c44397 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -8,6 +8,8 @@ on: permissions: checks: write contents: write + # deployments permission to deploy GitHub pages website + deployments: write pull-requests: write @@ -16,16 +18,16 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"] defaults: run: working-directory: . steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} cache: 'pip' @@ -44,13 +46,13 @@ jobs: - name: Run unit tests run: | - pytest + pytest --junitxml=junit_pytest_main.xml --cov-report=term-missing:skip-covered mv .coverage .coverage_main - name: Run Django integration tests working-directory: ./edtf_django_tests run: | - coverage run manage.py test edtf_integration + pytest edtf_integration/tests.py --ds=edtf_django_tests.settings --junitxml=../junit_pytest_django.xml --cov-report=term-missing:skip-covered mv .coverage ../.coverage_django - name: Combine coverage reports @@ -59,11 +61,16 @@ jobs: coverage report --omit="edtf_django_tests/*" coverage xml -o coverage_combined.xml --omit="edtf_django_tests/*" + - name: Combine JUnit XML reports + run: | + python combine_junit.py combined_junit_pytest.xml junit_pytest_main.xml junit_pytest_django.xml + - name: Pytest coverage comment id: coverageComment uses: MishaKav/pytest-coverage-comment@main with: pytest-xml-coverage-path: ./coverage_combined.xml + junitxml-path: ./combined_junit_pytest.xml unique-id-for-comment: ${{ matrix.python-version }} github-token: ${{ secrets.GITHUB_TOKEN }} @@ -71,4 +78,49 @@ jobs: run: | echo "Coverage Percentage - ${{ steps.coverageComment.outputs.coverage }}" echo "Coverage 
Color - ${{ steps.coverageComment.outputs.color }}" + echo "Coverage Html - ${{ steps.coverageComment.outputs.coverageHtml }}" + echo "Summary Report -" ${{ steps.coverageComment.outputs.summaryReport }} echo "Coverage Warnings - ${{ steps.coverageComment.outputs.warnings }}" + echo "Coverage Errors - ${{ steps.coverageComment.outputs.errors }}" + echo "Coverage Failures - ${{ steps.coverageComment.outputs.failures }}" + echo "Coverage Skipped - ${{ steps.coverageComment.outputs.skipped }}" + echo "Coverage Tests - ${{ steps.coverageComment.outputs.tests }}" + echo "Coverage Time - ${{ steps.coverageComment.outputs.time }}" + echo "Not Success Test Info - ${{ steps.coverageComment.outputs.notSuccessTestInfo }}" + + - name: Run benchmarks + run: | + pytest -m benchmark --benchmark-json=./output.json + + - name: Download previous benchmark data + uses: actions/cache@v5 + with: + path: ./cache + key: ${{ runner.os }}-benchmark + + - name: Publish benchmark results + uses: benchmark-action/github-action-benchmark@v1 + if: github.event_name == 'pull_request' && github.repository == 'ixc/python-edtf' + with: + tool: 'pytest' + auto-push: true + comment-always: true + output-file-path: output.json + github-token: ${{ secrets.GITHUB_TOKEN }} + comment-on-alert: true + save-data-file: true + summary-always: true + + - name: Comment on benchmark results without publishing + if: github.event_name != 'pull_request' || github.repository != 'ixc/python-edtf' + uses: benchmark-action/github-action-benchmark@v1 + with: + tool: 'pytest' + auto-push: false + github-token: ${{ secrets.GITHUB_TOKEN }} + comment-always: true + output-file-path: output.json + comment-on-alert: false + save-data-file: true + summary-always: true + external-data-json-path: ./cache/benchmark-data.json diff --git a/.github/workflows/coverage_readme.yml b/.github/workflows/coverage_readme.yml index 86309de..860ace3 100644 --- a/.github/workflows/coverage_readme.yml +++ b/.github/workflows/coverage_readme.yml 
@@ -17,15 +17,15 @@ jobs: update-coverage-on-readme: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 with: persist-credentials: false fetch-depth: 0 - - name: Set up Python 3.12 - uses: actions/setup-python@v5 + - name: Set up Python 3.13 + uses: actions/setup-python@v6 with: - python-version: 3.12 + python-version: 3.13 cache: 'pip' cache-dependency-path: '**/pyproject.toml' @@ -59,10 +59,9 @@ jobs: run: | sed -i '//,//c\\n\${{ steps.coverageComment.outputs.coverageHtml }}\n' ./README.md - - name: Commit & Push changes to README - run: | - git config --global user.name 'github-actions[bot]' - git config --global user.email 'github-actions[bot]@users.noreply.github.com' - git add README.md - git commit -m 'Update coverage badge in README' - git push + - name: Commit & Push changes to Readme + if: ${{ github.ref == 'refs/heads/main' }} + uses: actions-js/push@master + with: + message: Update coverage on Readme + github_token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.gitignore b/.gitignore index 182cf8b..d27f79d 100644 --- a/.gitignore +++ b/.gitignore @@ -46,6 +46,10 @@ coverage_combined.xml .coverage_main .coverage_django *,cover +combined_junit_pytest.xml +pytest.xml +junit_pytest_main.xml +junit_pytest_django.xml # Translations *.mo @@ -60,3 +64,5 @@ docs/_build/ # PyBuilder target/ +.idea +.DS_Store diff --git a/README.md b/README.md index c4f172e..206f4f4 100644 --- a/README.md +++ b/README.md @@ -1,83 +1,98 @@ -edtf -===== +# python-edtf +Coverage
Coverage Report
FileStmtsMissCoverMissing
edtf
   __init__.py40100% 
   appsettings.py29293%12–13
   convert.py631182%11–19, 21, 72
   fields.py1191190%1, 3–8, 10–13, 15, 23, 29, 31, 33–35, 38–39, 51–58, 60, 63, 65–70, 72–76, 78–79, 81, 83–84, 86, 88–89, 91, 93–95, 97–98, 100, 102–105, 107, 109–112, 114, 123–125, 128, 131–132, 135–136, 139–140, 142–144, 147, 151, 153, 155, 157, 160–173, 179, 181–182, 184–185, 190–191, 193–194, 196, 204, 206, 208–209, 212–213, 223–226, 234
   jdutil.py984455%37, 55, 91–92, 287, 291, 314, 316–317, 319, 321, 346, 348, 350, 370–372, 374, 376, 378, 381–383, 385, 387, 389, 392–393, 395, 397, 399–400, 402, 405–407, 410–413, 415, 417, 424, 431
   tests.py119496%137–138, 148–149
   util.py330100% 
edtf/natlang
   __init__.py20100% 
   en.py1581193%85, 88, 145, 181–182, 192–193, 218–219, 223, 290
   tests.py10190%211
edtf/parser
   __init__.py40100% 
   edtf_exceptions.py15286%24, 29
   grammar.py133496%365, 368, 370, 376
   parser_classes.py66110683%114–116, 123, 126, 188, 194–198, 205–207, 214–218, 227–229, 234–240, 273, 345, 358–359, 390–394, 397, 412, 415–419, 422–426, 444–446, 474, 483, 546, 560, 564, 600, 608, 612, 659–660, 666, 684–685, 688, 694, 700, 702, 706, 713, 754, 779, 785, 789, 804, 808, 898, 908, 919–920, 922, 929, 932, 943, 948, 953, 989, 992, 998, 1003, 1005–1013, 1028, 1033, 1111, 1116, 1149
   tests.py89198%412
TOTAL153730580% 
An implementation of EDTF format in Python, together with utility functions for parsing natural language date texts, and converting EDTF dates to related Python `date` or `struct_time` objects. -See http://www.loc.gov/standards/datetime/ for the current draft specification. - -This project is based on python-edtf and was developed to include the newest specification +See <http://www.loc.gov/standards/datetime/> for the final draft specification. ## To install - pip install edtf +```shell +pip install edtf +``` ## To use - >>> from edtf import parse_edtf - # Parse an EDTF string to an EDTFObject - >>> e = parse_edtf("1979-08~") # approx August 1979 - >>> e - UncertainOrApproximate: '1979-08~' - # normalised string representation (some different EDTF strings have identical meanings) - >>> unicode(e) - u'1979-08~' - - # Derive Python date objects - # lower and upper bounds that strictly adhere to the given range - >>> e.lower_strict()[:3], e.upper_strict()[:3] - ((1979, 8, 1), (1979, 8, 31)) - # lower and upper bounds that are padded if there's indicated uncertainty - >>> e.lower_fuzzy()[:3], e.upper_fuzzy()[:3] - ((1979, 7, 1), (1979, 9, 30)) - - # Date intervals - >>> interval = parse_edtf("1979-08~/..") - >>> interval - Level1Interval: '1979-08~/..' - # Intervals have lower and upper EDTF objects. - >>> interval.lower, interval.upper - (UncertainOrApproximate: '1979-08~', UnspecifiedIntervalSection: '..') - >>> interval.lower.lower_strict()[:3], interval.lower.upper_strict()[:3] - ((1979, 8, 1), (1979, 8, 31)) - >>> interval.upper.upper_strict() # '..' 
is interpreted to mean open interval and is returning -/+ math.inf - math.inf - - # Date collections - >>> coll = parse_edtf('{1667,1668, 1670..1672}') - >>> coll - MultipleDates: '{1667, 1668, 1670..1672}' - >>> coll.objects - (Date: '1667', Date: '1668', Consecutives: '1670..1672') +```python +>>> from edtf import parse_edtf + +# Parse an EDTF string to an EDTFObject +>>> +>>> e = parse_edtf("1979-08~") # approx August 1979 +>>> e +UncertainOrApproximate: '1979-08~' + +# normalised string representation (some different EDTF strings have identical meanings) +>>> +>>> str(e) +'1979-08~' + +# Derive Python date objects + +# lower and upper bounds that strictly adhere to the given range +>>> +>>> e.lower_strict()[:3], e.upper_strict()[:3] +((1979, 8, 1), (1979, 8, 31)) + +# lower and upper bounds that are padded if there's indicated uncertainty +>>> +>>> e.lower_fuzzy()[:3], e.upper_fuzzy()[:3] +((1979, 7, 1), (1979, 9, 30)) + +# Date intervals +>>> +>>> interval = parse_edtf("1979-08~/..") +>>> interval +Level1Interval: '1979-08~/..' + +# Intervals have lower and upper EDTF objects +>>> +>>> interval.lower, interval.upper +(UncertainOrApproximate: '1979-08~', UnspecifiedIntervalSection: '..') +>>> interval.lower.lower_strict()[:3], interval.lower.upper_strict()[:3] +((1979, 8, 1), (1979, 8, 31)) +>>> interval.upper.upper_strict() # '..' is interpreted to mean open interval and is returning -/+ math.inf +math.inf + +# Date collections +>>> +>>> coll = parse_edtf('{1667,1668, 1670..1672}') +>>> coll +MultipleDates: '{1667, 1668, 1670..1672}' +>>> coll.objects +(Date: '1667', Date: '1668', Consecutives: '1670..1672') +``` The object returned by `parse_edtf()` is an instance of an `edtf.parser.parser_classes.EDTFObject` subclass, depending on the type of date that was parsed. 
These classes are: - # Level 0 - Date - DateAndTime - Interval - - # Level 1 - UncertainOrApproximate - Unspecified - Level1Interval - UnspecifiedIntervalSection - LongYear - Season - - # Level 2 - PartialUncertainOrApproximate - PartialUnspecified - OneOfASet - MultipleDates - MaskedPrecision - Level2Interval - Level2Season - ExponentialYear - -All of these implement `upper/lower_strict/fuzzy()` methods to derive `struct_time` objects, except of UnspecifiedIntervalSection, that can also return math.inf value +```text +# Level 0 +Date +DateAndTime +Interval + +# Level 1 +UncertainOrApproximate +Unspecified +Level1Interval +UnspecifiedIntervalSection +LongYear +Season + +# Level 2 +PartialUncertainOrApproximate +PartialUnspecified +OneOfASet +MultipleDates +Level2Interval +Level2Season +ExponentialYear +``` + +All of these implement `lower_strict()`, `upper_strict()`, `lower_fuzzy()` and `upper_fuzzy()` methods to derive `struct_time` objects, except for `UnspecifiedIntervalSection`, which can also return a `math.inf` value The `*Interval` instances have `upper` and `lower` properties that are themselves `EDTFObject` instances. @@ -93,153 +108,209 @@ Test coverage includes every example given in the spec table of features. 
* Date: - >>> parse_edtf('1979-08') # August 1979 - Date: '1979-08' +```python +>>> parse_edtf('1979-08') # August 1979 +Date: '1979-08' +``` * Date and Time: - >>> parse_edtf('2004-01-01T10:10:10+05:00') - DateAndTime: '2004-01-01T10:10:10+05:00' +```python +>>> parse_edtf('2004-01-01T10:10:10+05:00') +DateAndTime: '2004-01-01T10:10:10+05:00' +``` * Interval (start/end): - >>> parse_edtf('1979-08-28/1979-09-25') # From August 28 to September 25 1979 - Interval: '1979-08-28/1979-09-25' +```python +>>> parse_edtf('1979-08-28/1979-09-25') # From August 28 to September 25 1979 +Interval: '1979-08-28/1979-09-25' +``` ### Level 1 Extensions * Uncertain/Approximate dates: - >>> parse_edtf('1979-08-28~') # Approximately August 28th 1979 - UncertainOrApproximate: '1979-08-28~' +```python +>>> parse_edtf('1979-08-28~') # Approximately August 28th 1979 +UncertainOrApproximate: '1979-08-28~' +``` * Unspecified dates: - >>> parse_edtf('1979-08-XX') # An unknown day in August 1979 - Unspecified: '1979-08-XX' - >>> parse_edtf('1979-XX') # Some month in 1979 - Unspecified: '1979-XX' +```python +>>> parse_edtf('1979-08-XX') # An unknown day in August 1979 +Unspecified: '1979-08-XX' +>>> parse_edtf('1979-XX') # Some month in 1979 +Unspecified: '1979-XX' +``` * Extended intervals: - >>> parse_edtf('1984-06-02?/2004-08-08~') - Level1Interval: '1984-06-02?/2004-08-08~' +```python +>>> parse_edtf('1984-06-02?/2004-08-08~') +Level1Interval: '1984-06-02?/2004-08-08~' +``` * Years exceeding four digits: - >>> parse_edtf('Y-12000') # 12000 years BCE - LongYear: 'Y-12000' +```python +>>> parse_edtf('Y-12000') # 12000 years BCE +LongYear: 'Y-12000' +``` * Season: - >>> parse_edtf('1979-22') # Summer 1979 - Season: '1979-22' +```python +>>> parse_edtf('1979-22') # Summer 1979 +Season: '1979-22' +``` ### Level 2 Extensions * Partial uncertain/approximate: - >>> parse_edtf('(2011)-06-04~') # year certain, month/day approximate. 
- # Note that the result text is normalized - PartialUncertainOrApproximate: '2011-(06-04)~' +```python +>>> parse_edtf('2004-06~-11') # year and month approximate, day certain. +PartialUncertainOrApproximate: '2004-06~-11' +``` * Partial unspecified: - >>> parse_edtf('1979-XX-28') # The 28th day of an uncertain month in 1979 - PartialUnspecified: '1979-XX-28' +```python +>>> parse_edtf('1979-XX-28') # The 28th day of an unspecified month in 1979 +PartialUnspecified: '1979-XX-28' +``` * One of a set: - >>> parse_edtf("[..1760-12-03,1762]") - OneOfASet: '[..1760-12-03, 1762]' +```python +>>> parse_edtf("[..1760-12-03,1762]") +OneOfASet: '[..1760-12-03, 1762]' +``` * Multiple dates: - >>> parse_edtf('{1667,1668, 1670..1672}') - MultipleDates: '{1667, 1668, 1670..1672}' - -* Masked precision: - - >>> parse_edtf('197x') # A date in the 1970s. - MaskedPrecision: '197x' +```python +>>> parse_edtf('{1667,1668, 1670..1672}') +MultipleDates: '{1667, 1668, 1670..1672}' +``` * Level 2 Extended intervals: - >>> parse_edtf('2004-06-(01)~/2004-06-(20)~') - Level2Interval: '2004-06-(01)~/2004-06-(20)~' +```python +>>> parse_edtf('2004-06-~01/2004-06-~20') +Level2Interval: '2004-06-~01/2004-06-~20' +``` * Year requiring more than 4 digits - exponential form: - >>> parse_edtf('Y-17e7') - ExponentialYear: 'Y-17e7' +```python +>>> e = parse_edtf('Y-17E7') +ExponentialYear: 'Y-17E7' +>>> e.estimated() +-170000000 +``` + +* Significant digits: + +```python +# '1950S2': some year between 1900 and 1999, estimated to be 1950 +>>> d = parse_edtf('1950S2') +Date: '1950S2' +>>> d.lower_fuzzy()[:3] +(1900, 1, 1) +>>> d.upper_fuzzy()[:3] +(1999, 12, 31) +# 'Y171010000S3': some year between 171000000 and 171999999 estimated to be 171010000, with 3 significant digits. 
+>>> l = parse_edtf('Y171010000S3') +LongYear: 'Y171010000S3' +>>> l.estimated() +171010000 +>>> l.lower_fuzzy()[:3] +(171000000, 1, 1) +>>> l.upper_fuzzy()[:3] +(171999999, 12, 31) +# 'Y3388E2S3': some year in exponential notation between 338000 and 338999, estimated to be 338800 +>>> e = parse_edtf('Y3388E2S3') +ExponentialYear: 'Y3388E2S3S3' +>>> e.estimated() +338800 +>>> e.lower_fuzzy()[:3] +(338000, 1, 1) +>>> e.upper_fuzzy()[:3] +(338999, 12, 31) +``` ### Natural language representation - The library includes a basic English natural language parser (it's not yet smart enough to work with occasions such as 'Easter', or in other languages): - >>> from edtf import text_to_edtf - >>> text_to_edtf("circa August 1979") - '1979-08~' +```python +>>> from edtf import text_to_edtf +>>> text_to_edtf("circa August 1979") +'1979-08~' +``` -Note that the result is a string, not an `ETDFObject`. +Note that the result is a string, not an `EDTFObject`. The parser can parse strings such as: - 'January 12, 1940' => '1940-01-12' - '90' => '1990' #implied century - 'January 2008' => '2008-01' - 'the year 1800' => '1800' - '10/7/2008' => '2008-10-07' # in a full-specced date, assume US ordering - - # uncertain/approximate - '1860?' => '1860?' - '1862 (uncertain)' => '1862?' - 'circa Feb 1812' => '1812-02~' - 'c.1860' => '1860~' #with or without . - 'ca1860' => '1860~' - 'approx 1860' => '1860~' - 'ca. 1860s' => '186X~' - 'circa 1840s' => '184X~' - 'ca. 1860s?' => '186X?~' - 'c1800s?' 
=> '180X?~' # with uncertainty indicators, use the decade - - # unspecified parts - 'January 12' => 'XXXX-01-12' - 'January' => 'XXXX-01' - '7/2008' => '2008-07' - 'month in 1872' => '1872-XX' - 'day in January 1872' => '1872-01-XX' - 'day in 1872' => '1872-XX-XX' - - #seasons - 'Autumn 1872' => '1872-23' - 'Fall 1872' => '1872-23' - - # before/after - 'earlier than 1928' => '/1928' - 'later than 1928' => '1928/' - 'before January 1928' => '/1928-01' - 'after about the 1920s' => '192X~/' - - #centuries - '1st century' => '00XX' - '10c' => '09XX' - '19th century?' => '18XX?' - - # just showing off now... - 'a day in about Spring 1849?' => '1849-21-XX?~' - - # simple ranges, which aren't as accurate as they could be. The parser is - limited to only picking the first year range it finds. - '1851-1852' => '1851/1852' - '1851-1852; printed 1853-1854' => '1851/1852' - '1851-52' => '1851/1852' - '1856-ca. 1865' => '1856/1865~' - '1860s-1870s' => '186X/187X' - '1920s - early 1930s' => '192X/193X' - '1938, printed 1940s-1950s' => '1938' - +```text +'January 12, 1940' => '1940-01-12' +'90' => '1990' #implied century +'January 2008' => '2008-01' +'the year 1800' => '1800' +'10/7/2008' => '2008-10-07' # in a full-specced date, assume US ordering + +# uncertain/approximate +'1860?' => '1860?' +'1862 (uncertain)' => '1862?' +'circa Feb 1812' => '1812-02~' +'c.1860' => '1860~' #with or without . +'ca1860' => '1860~' +'approx 1860' => '1860~' +'ca. 1860s' => '186X~' +'circa 1840s' => '184X~' +'ca. 1860s?' => '186X?~' +'c1800s?' 
=> '180X?~' # with uncertainty indicators, use the decade + +# unspecified parts +'January 12' => 'XXXX-01-12' +'January' => 'XXXX-01' +'7/2008' => '2008-07' +'month in 1872' => '1872-XX' +'day in January 1872' => '1872-01-XX' +'day in 1872' => '1872-XX-XX' + +#seasons +'Autumn 1872' => '1872-23' +'Fall 1872' => '1872-23' + +# before/after +'earlier than 1928' => '/1928' +'later than 1928' => '1928/' +'before January 1928' => '/1928-01' +'after about the 1920s' => '192X~/' + +#centuries +'1st century' => '00XX' +'10c' => '09XX' +'19th century?' => '18XX?' + +# just showing off now... +'a day in about Spring 1849?' => '1849-21-XX?~' + +# simple ranges, which aren't as accurate as they could be. The parser is +limited to only picking the first year range it finds. +'1851-1852' => '1851/1852' +'1851-1852; printed 1853-1854' => '1851/1852' +'1851-52' => '1851/1852' +'1856-ca. 1865' => '1856/1865~' +'1860s-1870s' => '186X/187X' +'1920s - early 1930s' => '192X/193X' +'1938, printed 1940s-1950s' => '1938' +``` Generating natural text from an EDTF representation is a future goal. @@ -253,22 +324,20 @@ Generating natural text from an EDTF representation is a future goal. * If a natural language groups dates with a '/', it's interpreted as "or" rather than "and". The resulting EDTF text is a list bracketed by `[]` ("one of these dates") rather than `{}` (all of these dates). - ## Converting to and from Python dates - Since EDTF dates are often regions, and often imprecise, we need to use a few different Python dates, depending on the circumstance. Generally, Python dates are used for sorting and filtering, and are not displayed directly to users. - ### `struct_time` date representation -Because Python's `datetime` module does not support dates out side the range 1 AD to 9999 AD we return dates as `time.struct_time` objects by default instead of the `datetime.date` or `datetime.datetime` objects you might expect. 
+Because Python's `datetime` module does not support dates outside the range 1 AD to 9999 AD we return dates as `time.struct_time` objects by default instead of the `datetime.date` or `datetime.datetime` objects you might expect. The `struct_time` representation is more difficult to work with, but can be sorted as-is which is the primary use-case, and can be converted relatively easily to `date` or `datetime` objects (provided the year is within 1 to 9999 AD) or to date objects in more flexible libraries like [astropy.time](http://docs.astropy.org/en/stable/time/index.html) for years outside these bounds. If you are sure you are working with dates within the range supported by Python's `datetime` module, you can get these more convenient objects using the `edtf.struct_time_to_date` and `edtf.struct_time_to_datetime` functions. -NOTE: This library previously did return `date` and `datetime` objects from methods by default before we switched to `struct_time`. See ticket https://github.com/ixc/python-edtf/issues/26. +> [!NOTE] +> This library previously did return `date` and `datetime` objects from methods by default before we switched to `struct_time`. See ticket <https://github.com/ixc/python-edtf/issues/26>. ### `lower_strict` and `upper_strict` @@ -276,56 +345,109 @@ These dates indicate the earliest and latest dates that are __strictly__ in the In an ascending sort (most recent last), sort by `lower_strict` to get a natural sort order. 
In a descending sort (most recent first), sort by `upper_strict`: - >>> e = parse_edtf('1912-04~') +```python +>>> e = parse_edtf('1912-04~') - >>> e.lower_strict() # Returns struct_time - >>> time.struct_time(tm_year=1912, tm_mon=4, tm_mday=1, tm_hour=0, tm_min=0, tm_sec=0, tm_wday=0, tm_yday=0, tm_isdst=-1) +>>> e.lower_strict() # Returns struct_time +>>> time.struct_time(tm_year=1912, tm_mon=4, tm_mday=1, tm_hour=0, tm_min=0, tm_sec=0, tm_wday=0, tm_yday=0, tm_isdst=-1) - >>> e.lower_strict()[:3] # Show only interesting parts of struct_time - (1912, 4, 01) +>>> e.lower_strict()[:3] # Show only interesting parts of struct_time +(1912, 4, 01) - >>> from edtf import struct_time_to_date - >>> struct_time_to_date(e.lower_strict()) # Convert to date - datetime.date(1912, 4, 01) +>>> from edtf import struct_time_to_date +>>> struct_time_to_date(e.lower_strict()) # Convert to date +datetime.date(1912, 4, 01) - >>> e.upper_strict()[:3] - (1912, 4, 30) +>>> e.upper_strict()[:3] +(1912, 4, 30) - >>> struct_time_to_date(e.upper_strict()) - datetime.date(1912, 4, 30) +>>> struct_time_to_date(e.upper_strict()) +datetime.date(1912, 4, 30) +``` ### `lower_fuzzy` and `upper_fuzzy` ------------------------------------ These dates indicate the earliest and latest dates that are __possible__ in the date range, for a fairly arbitrary definition of 'possibly'. These values are useful for filtering results - i.e. testing which EDTF dates might conceivably fall into, or overlap, a desired date range. -The fuzzy dates are derived from the strict dates, plus or minus a level of padding that depends on how precise the date specfication is. For the case of approximate or uncertain dates, we (arbitrarily) pad the ostensible range by 100% of the uncertain timescale, or by a 12 weeks in the case of seasons. That is, if a date is approximate at the month scale, it is padded by a month. 
If it is approximate at the year scale, it is padded by a year: +The fuzzy dates are derived from the strict dates, plus or minus a level of padding that depends on how precise the date specification is. For the case of approximate or uncertain dates, we (arbitrarily) pad the ostensible range by 100% of the uncertain timescale, or by 12 weeks in the case of seasons. That is, if a date is approximate at the month scale, it is padded by a month. If it is approximate at the year scale, it is padded by a year: - >>> e = parse_edtf('1912-04~') - >>> e.lower_fuzzy()[:3] # padding is 100% of a month - (1912, 3, 1) - >>> e.upper_fuzzy()[:3] - (1912, 5, 30) +```python +>>> e = parse_edtf('1912-04~') +>>> e.lower_fuzzy()[:3] # padding is 100% of a month +(1912, 3, 1) +>>> e.upper_fuzzy()[:3] +(1912, 5, 30) - >>> e = parse_edtf('1912~') - >>> e.lower_fuzzy()[:3] # padding is 100% of a year - (1911, 1, 1) - >>> e.upper_fuzzy()[:3] - (1913, 12, 31) +>>> e = parse_edtf('1912~') +>>> e.lower_fuzzy()[:3] # padding is 100% of a year +(1911, 1, 1) +>>> e.upper_fuzzy()[:3] +(1913, 12, 31) +``` One can interpret uncertain or approximate dates as 'plus or minus a [level of precision]'. If a date is both uncertain __and__ approximate, the padding is applied twice, i.e. it gets 100% * 2 padding, or 'plus or minus two [levels of precision]'. +### Qualification properties + +EDTF objects support properties that provide an overview of how the object is qualified: + +* `.is_uncertain (?)` +* `.is_approximate (~)` +* `.is_uncertain_and_approximate (%)` + +These properties represent whether any part of the date object is uncertain, approximate, or uncertain and approximate. For ranges, the properties are true if any part of the range (lower or upper section) is qualified as such. A date is not necessarily uncertain and approximate if it is separately both uncertain and approximate - it must have the "%" qualifier to be considered uncertain and approximate. 
+ +```python +>>> parse_edtf("2006-06-11") +Date: '2006-06-11' +>>> parse_edtf("2006-06-11").is_uncertain +False +>>> parse_edtf("2006-06-11").is_approximate +False + +>>> parse_edtf("1984?") +UncertainOrApproximate: '1984?' +>>> parse_edtf("1984?").is_approximate +False +>>> parse_edtf("1984?").is_uncertain +True +>>> parse_edtf("1984?").is_uncertain_and_approximate +False + +>>> parse_edtf("1984%").is_uncertain +False +>>> parse_edtf("1984%").is_uncertain_and_approximate +True + +>>> parse_edtf("1984~/2004-06") +Level1Interval: '1984~/2004-06' +>>> parse_edtf("1984~/2004-06").is_approximate +True +>>> parse_edtf("1984~/2004-06").is_uncertain +False + +>>> parse_edtf("2004?-~06-~04") +PartialUncertainOrApproximate: '2004?-~06-~04' +>>> parse_edtf("2004?-~06-~04").is_approximate +True +>>> parse_edtf("2004?-~06-~04").is_uncertain +True +>>> parse_edtf("2004?-~06-~04").is_uncertain_and_approximate +False +``` + ### Seasons -Seasons are interpreted as Northern Hemisphere by default. To change this, override the month mapping in `appsettings.py`. +> [!IMPORTANT] +> Seasons are interpreted as Northern Hemisphere by default. To change this, override the month mapping in [`appsettings.py`](edtf/appsettings.py). ### Comparisons -Two EDTF dates are considered equal if their unicode() representations are the same. An EDTF date is considered greater than another if its `lower_strict` value is later. +Two EDTF dates are considered equal if their `str()` representations are the same. An EDTF date is considered greater than another if its `lower_strict` value is later. ## Django ORM field @@ -335,54 +457,64 @@ To store a natural language value on your model, define another field, and set t When your model is saved, the `natural_text_field` value will be parsed to set the `date_edtf` value, and the underlying EDTF object will set the `_earliest` and `_latest` fields on the model to a float value representing the Julian Date. 
- -**WARNING**: The conversion to and from Julian Date numerical values can be inaccurate, especially for ancient dates back to thousands of years BC. Ideally Julian Date values should be used for range and ordering operations only where complete accuracy is not required. They should **not** be used for definitive storage or for display after roundtrip conversions. +> [!WARNING] +> The conversion to and from Julian Date numerical values can be inaccurate, especially for ancient dates back to thousands of years BC. Ideally Julian Date values should be used for range and ordering operations only where complete accuracy is not required. They should __not__ be used for definitive storage or for display after roundtrip conversions. Example usage: - from django.db import models - from edtf.fields import EDTFField - - class MyModel(models.Model): - date_display = models.CharField( - "Date of creation (display)", - blank=True, - max_length=255, - ) - date_edtf = EDTFField( - "Date of creation (EDTF)", - natural_text_field='date_display', - lower_fuzzy_field='date_earliest', - upper_fuzzy_field='date_latest', - lower_strict_field='date_sort_ascending', - upper_strict_field='date_sort_descending', - blank=True, - null=True, - ) - # use for filtering - date_earliest = models.FloatField(blank=True, null=True) - date_latest = models.FloatField(blank=True, null=True) - # use for sorting - date_sort_ascending = models.FloatField(blank=True, null=True) - date_sort_descending = models.FloatField(blank=True, null=True) - +```python +from django.db import models +from edtf.fields import EDTFField + +class MyModel(models.Model): + date_display = models.CharField( + "Date of creation (display)", + blank=True, + max_length=255, + ) + date_edtf = EDTFField( + "Date of creation (EDTF)", + natural_text_field='date_display', + lower_fuzzy_field='date_earliest', + upper_fuzzy_field='date_latest', + lower_strict_field='date_sort_ascending', + upper_strict_field='date_sort_descending', + 
blank=True, + null=True, + ) + # use for filtering + date_earliest = models.FloatField(blank=True, null=True) + date_latest = models.FloatField(blank=True, null=True) + # use for sorting + date_sort_ascending = models.FloatField(blank=True, null=True) + date_sort_descending = models.FloatField(blank=True, null=True) +``` Since the `EDTFField` and the `_earliest` and `_latest` field values are set automatically, you may want to make them readonly, or not visible in your model admin. ## To develop + ### Setup -- Clone the repository: `git clone https://github.com/ixc/python-edtf.git` -- Set up a virtual environment: `python3 -m venv venv` -- Install the dependencies: `pip install -r dev-requirements.txt` -- Install precommit hooks: `pre-commit install` + +* Clone the repository: `git clone https://github.com/ixc/python-edtf.git` +* Set up a virtual environment: `python3 -m venv venv` +* Install the dependencies: `pip install -r dev-requirements.txt` +* Install precommit hooks: `pre-commit install` ### Running tests -- From `python-edtf`, run the unit tests: `pytest` -- From `python-edtf/edtf_django_tests`, run the integration tests: `python manage.py test edtf_integration` -- To run CI locally, use `act`, e.g. `act pull_request` or `act --pull=false --container-architecture linux/amd64`. Some steps may require a Github PAT: `act pull_request --container-architecture linux/amd64 --pull=false -s GITHUB_TOKEN=` + +* From `python-edtf`, run the unit tests: `pytest` +* From `python-edtf`, run `pytest -m benchmark` to run the benchmarks (published [here]( https://ixc.github.io/python-edtf/dev/bench/)) +* From `python-edtf/edtf_django_tests`, run the integration tests: `python manage.py test edtf_integration` +* To run CI locally, use `act`, e.g. `act pull_request` or `act --pull=false --container-architecture linux/amd64`. 
Some steps may require a GitHub PAT: `act pull_request --container-architecture linux/amd64 --pull=false -s GITHUB_TOKEN=` ### Linting and formatting -- Check linting: `ruff check --output-format=github --config pyproject.toml` -- Check formatting: `ruff format --check --config pyproject.toml` -- Fix formatting: `ruff format --config pyproject.toml` -- Linting and formatting checks and attempted fixes are also run as precommit hooks if you installed them. + +* Check linting: `ruff check --output-format=github --config pyproject.toml` +* Check formatting: `ruff format --check --config pyproject.toml` +* Fix formatting: `ruff format --config pyproject.toml` +* Linting and formatting checks and attempted fixes are also run as precommit hooks if you installed them. + +### Coverage and benchmarks + +Coverage reports are generated and added as comments to commits, and also visible in the actions log. Benchmarks are run on pull requests and are published [here]( https://ixc.github.io/python-edtf/dev/bench/) and also visible in the actions log. diff --git a/combine_junit.py b/combine_junit.py new file mode 100644 index 0000000..5e3a05b --- /dev/null +++ b/combine_junit.py @@ -0,0 +1,23 @@ +import sys + +from junitparser import JUnitXml + + +def combine_junit_xml(output_file, *input_files): + combined_xml = JUnitXml() + for input_file in input_files: + xml = JUnitXml.fromfile(input_file) + combined_xml.extend(xml) + combined_xml.write(output_file) + + +if __name__ == "__main__": + if len(sys.argv) < 3: + print( + "Usage: python combine_junit_xml.py ... 
" + ) + sys.exit(1) + + output_file = sys.argv[1] + input_files = sys.argv[2:] + combine_junit_xml(output_file, *input_files) diff --git a/dev-requirements.txt b/dev-requirements.txt index 1e37df5..c27d485 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,5 +1,8 @@ -r requirements.txt # Include all main requirements django>=4.2,<5.0 pytest +pytest-benchmark +pytest-cov +pytest-django ruff pre-commit diff --git a/edtf/__init__.py b/edtf/__init__.py index 7bb2885..0b0bfbf 100644 --- a/edtf/__init__.py +++ b/edtf/__init__.py @@ -22,6 +22,7 @@ UncertainOrApproximate, Unspecified, UnspecifiedIntervalSection, + is_valid_edtf, parse_edtf, ) @@ -46,6 +47,7 @@ "trim_struct_time", "text_to_edtf", "parse_edtf", + "is_valid_edtf", # parser_exceptions "EDTFParseException", # parser_classes diff --git a/edtf/appsettings.py b/edtf/appsettings.py index e1bc821..e9b4d9d 100644 --- a/edtf/appsettings.py +++ b/edtf/appsettings.py @@ -12,7 +12,7 @@ except ImportError: EDTF = {} -SEASON_MONTHS_RANGE = EDTF.get( +SEASON_MONTHS_RANGE: dict[int, list[int]] = EDTF.get( "SEASON_MONTHS_RANGE", { # season id: [earliest_month, last_month] @@ -27,7 +27,7 @@ }, ) -SEASON_L2_MONTHS_RANGE = EDTF.get( +SEASON_L2_MONTHS_RANGE: dict[int, list[int]] = EDTF.get( "SEASON_L2_MONTHS_RANGE", { # season id: [earliest_month, last_month] @@ -67,9 +67,9 @@ }, ) -DAY_FIRST = EDTF.get("DAY_FIRST", False) # Americans! +DAY_FIRST: bool = EDTF.get("DAY_FIRST", False) # Americans! 
-SEASONS = EDTF.get( +SEASONS: dict[int, str] = EDTF.get( "SEASONS", { 21: "spring", @@ -78,16 +78,38 @@ 24: "winter", }, ) -INVERSE_SEASONS = EDTF.get("INVERSE_SEASONS", {v: k for k, v in SEASONS.items()}) +INVERSE_SEASONS: dict[str, int] = EDTF.get( + "INVERSE_SEASONS", {v: k for k, v in SEASONS.items()} +) # also need to interpret `fall` INVERSE_SEASONS["fall"] = 23 # changing these will break tests -PADDING_DAY_PRECISION = EDTF.get("PADDING_DAY_PRECISION", relativedelta(days=1)) -PADDING_MONTH_PRECISION = EDTF.get("PADDING_MONTH_PRECISION", relativedelta(months=1)) -PADDING_YEAR_PRECISION = EDTF.get("PADDING_YEAR_PRECISION", relativedelta(years=1)) -PADDING_SEASON_PRECISION = EDTF.get("PADDING_SEASON_PRECISION", relativedelta(weeks=12)) -MULTIPLIER_IF_UNCERTAIN = EDTF.get("MULTIPLIER_IF_UNCERTAIN", 1.0) -MULTIPLIER_IF_APPROXIMATE = EDTF.get("MULTIPLIER_IF_APPROXIMATE", 1.0) -MULTIPLIER_IF_BOTH = EDTF.get("MULTIPLIER_IF_BOTH", 2.0) -DELTA_IF_UNKNOWN = EDTF.get("DELTA_IF_UNKNOWN", relativedelta(years=10)) +PADDING_DAY_PRECISION: relativedelta = EDTF.get( + "PADDING_DAY_PRECISION", relativedelta(days=1) +) +PADDING_MONTH_PRECISION: relativedelta = EDTF.get( + "PADDING_MONTH_PRECISION", relativedelta(months=1) +) +PADDING_YEAR_PRECISION: relativedelta = EDTF.get( + "PADDING_YEAR_PRECISION", relativedelta(years=1) +) +PADDING_SEASON_PRECISION: relativedelta = EDTF.get( + "PADDING_SEASON_PRECISION", relativedelta(weeks=12) +) +PADDING_DECADE_PRECISION: relativedelta = EDTF.get( + "PADDING_DECADE_PRECISION", relativedelta(years=10) +) +PADDING_CENTURY_PRECISION: relativedelta = EDTF.get( + "PADDING_CENTURY_PRECISION", relativedelta(years=100) +) +PADDING_MILLENNIUM_PRECISION: relativedelta = EDTF.get( + "PADDING_MILLENNIUM_PRECISION", relativedelta(years=1000) +) +MULTIPLIER_IF_UNCERTAIN: float = EDTF.get("MULTIPLIER_IF_UNCERTAIN", 1.0) +MULTIPLIER_IF_APPROXIMATE: float = EDTF.get("MULTIPLIER_IF_APPROXIMATE", 1.0) +MULTIPLIER_IF_BOTH: float = 
EDTF.get("MULTIPLIER_IF_BOTH", 2.0) +DELTA_IF_UNKNOWN: relativedelta = EDTF.get("DELTA_IF_UNKNOWN", relativedelta(years=10)) +DELTA_IF_EMPTY: relativedelta = relativedelta(None) + +DEBUG_PYPARSING: bool = False diff --git a/edtf/convert.py b/edtf/convert.py index a294462..c03e2ea 100644 --- a/edtf/convert.py +++ b/edtf/convert.py @@ -21,7 +21,7 @@ def old_specs_to_new_specs_expression(expression): return expression -def dt_to_struct_time(dt): +def dt_to_struct_time(dt) -> struct_time: """ Convert a `datetime.date` or `datetime.datetime` to a `struct_time` representation *with zero values* for data fields that we cannot always @@ -39,11 +39,10 @@ def dt_to_struct_time(dt): return struct_time( [dt.year, dt.month, dt.day] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS ) - else: - raise NotImplementedError(f"Cannot convert {type(dt)} to `struct_time`") + raise NotImplementedError(f"Cannot convert {type(dt)} to `struct_time`") -def struct_time_to_date(st): +def struct_time_to_date(st: struct_time) -> date: """ Return a `datetime.date` representing the provided `struct_time. @@ -52,7 +51,7 @@ def struct_time_to_date(st): return date(*st[:3]) -def struct_time_to_datetime(st): +def struct_time_to_datetime(st: struct_time) -> datetime: """ Return a `datetime.datetime` representing the provided `struct_time. @@ -61,7 +60,7 @@ def struct_time_to_datetime(st): return datetime(*st[:6]) -def trim_struct_time(st, strip_time=False): +def trim_struct_time(st: struct_time, strip_time: bool = False) -> struct_time: """ Return a `struct_time` based on the one provided but with the extra fields `tm_wday`, `tm_yday`, and `tm_isdst` reset to default values. 
@@ -71,11 +70,10 @@ def trim_struct_time(st, strip_time=False): """ if strip_time: return struct_time(list(st[:3]) + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS) - else: - return struct_time(list(st[:6]) + TIME_EMPTY_EXTRAS) + return struct_time(list(st[:6]) + TIME_EMPTY_EXTRAS) -def struct_time_to_jd(st): +def struct_time_to_jd(st: struct_time) -> float: """ Return a float number representing the Julian Date for the given `struct_time`. @@ -91,7 +89,7 @@ def struct_time_to_jd(st): return jdutil.date_to_jd(year, month, day) -def jd_to_struct_time(jd): +def jd_to_struct_time(jd: float) -> struct_time: """ Return a `struct_time` converted from a Julian Date float number. @@ -117,7 +115,7 @@ def jd_to_struct_time(jd): return struct_time([year, month, day, hour, minute, second] + TIME_EMPTY_EXTRAS) -def _roll_negative_time_fields(year, month, day, hour, minute, second): +def _roll_negative_time_fields(year, month, day, hour, minute, second) -> tuple: """ Fix date/time fields which have nonsense negative values for any field except for year by rolling the overall date/time value backwards, treating @@ -153,4 +151,5 @@ def _roll_negative_time_fields(year, month, day, hour, minute, second): year += int(month / 12.0) # Adjust by whole year in months year -= 1 # Subtract 1 for negative minutes month %= 12 # Convert negative month to positive remainder - return (year, month, day, hour, minute, second) + + return year, month, day, hour, minute, second diff --git a/edtf/fields.py b/edtf/fields.py index f717592..07a9744 100644 --- a/edtf/fields.py +++ b/edtf/fields.py @@ -1,13 +1,16 @@ import pickle +from django.core import checks from django.core.exceptions import FieldDoesNotExist from django.db import models from django.db.models import signals from django.db.models.query_utils import DeferredAttribute +from pyparsing import ParseException from edtf import EDTFObject, parse_edtf from edtf.convert import struct_time_to_date, struct_time_to_jd from edtf.natlang import text_to_edtf 
+from edtf.parser.edtf_exceptions import EDTFParseException DATE_ATTRS = ( "lower_strict", @@ -46,21 +49,12 @@ def __init__( **kwargs, ): kwargs["max_length"] = 2000 - ( - self.natural_text_field, - self.direct_input_field, - self.lower_strict_field, - self.upper_strict_field, - self.lower_fuzzy_field, - self.upper_fuzzy_field, - ) = ( - natural_text_field, - direct_input_field, - lower_strict_field, - upper_strict_field, - lower_fuzzy_field, - upper_fuzzy_field, - ) + self.natural_text_field = natural_text_field + self.direct_input_field = direct_input_field + self.lower_strict_field = lower_strict_field + self.upper_strict_field = upper_strict_field + self.lower_fuzzy_field = lower_fuzzy_field + self.upper_fuzzy_field = upper_fuzzy_field super().__init__(verbose_name, name, **kwargs) description = ( @@ -72,6 +66,8 @@ def deconstruct(self): name, path, args, kwargs = super().deconstruct() if self.natural_text_field: kwargs["natural_text_field"] = self.natural_text_field + if self.direct_input_field: + kwargs["direct_input_field"] = self.direct_input_field for attr in DATE_ATTRS: field = f"{attr}_field" @@ -132,13 +128,15 @@ def update_values(self, instance, *args, **kwargs): if direct_input and ( existing_value is None or str(existing_value) != direct_input ): - edtf = parse_edtf( - direct_input, fail_silently=True - ) # ParseException if invalid; should this be raised? - # TODO pyparsing.ParseExceptions are very noisy and dumps the whole grammar (see https://github.com/ixc/python-edtf/issues/46) + try: + edtf = parse_edtf( + direct_input, fail_silently=True + ) # ParseException if invalid; should this be raised? 
+ except ParseException as err: + raise EDTFParseException(direct_input, err) from None # set the natural_text (display) field to the direct_input if it is not provided - if natural_text == "": + if not natural_text: setattr(instance, self.natural_text_field, direct_input) elif natural_text: @@ -148,7 +146,7 @@ def update_values(self, instance, *args, **kwargs): ): edtf = parse_edtf( edtf_string, fail_silently=True - ) # potetial ParseException if invalid; should this be raised? + ) # potential ParseException if invalid; should this be raised? else: edtf = existing_value else: @@ -191,3 +189,46 @@ def contribute_to_class(self, cls, name, **kwargs): # Only run post-initialization values update on non-abstract models if not cls._meta.abstract: signals.post_init.connect(self.update_values, sender=cls) + + def check(self, **kwargs): + errors = super().check(**kwargs) + + for field_alias in [ + "direct_input_field", + "lower_fuzzy_field", + "lower_strict_field", + "natural_text_field", + "upper_fuzzy_field", + "upper_strict_field", + ]: + errors.extend(self._check_field(field_alias)) + + return errors + + def _check_field(self, field_alias): + field_name = getattr(self, field_alias, None) + + # Check if the alias value has been provided in the field definition + if not field_name: + return [ + checks.Error( + f"You must specify a '{field_alias}' for EDTFField", + hint=None, + obj=self, + id="python-edtf.EDTF01", + ) + ] + + # Check if the field that is referenced actually exists + try: + self.model._meta.get_field(field_name) + except FieldDoesNotExist: + return [ + checks.Error( + f"'{self.name}' refers to a non-existent '{field_alias}' field: '{field_name}'", + hint=None, + obj=self, + id="python-edtf.EDTF02", + ) + ] + return [] diff --git a/edtf/jdutil.py b/edtf/jdutil.py index 16cd312..b7a2cbb 100644 --- a/edtf/jdutil.py +++ b/edtf/jdutil.py @@ -18,7 +18,7 @@ # time deltas if one date is from before 10-15-1582. 
-def mjd_to_jd(mjd): +def mjd_to_jd(mjd: float) -> float: """ Convert Modified Julian Day to Julian Day. @@ -37,7 +37,7 @@ def mjd_to_jd(mjd): return mjd + 2400000.5 -def jd_to_mjd(jd): +def jd_to_mjd(jd: float) -> float: """ Convert Julian Day to Modified Julian Day @@ -55,7 +55,7 @@ def jd_to_mjd(jd): return jd - 2400000.5 -def date_to_jd(year, month, day): +def date_to_jd(year: int, month: int, day: float) -> float: """ Convert a date to Julian Day. @@ -117,7 +117,7 @@ def date_to_jd(year, month, day): return jd -def jd_to_date(jd): +def jd_to_date(jd: float) -> tuple: """ Convert Julian Day to date. @@ -175,7 +175,7 @@ def jd_to_date(jd): return year, month, day -def hmsm_to_days(hour=0, min=0, sec=0, micro=0): +def hmsm_to_days(hour: int = 0, min: int = 0, sec: int = 0, micro: int = 0) -> float: """ Convert hours, minutes, seconds, and microseconds to fractional days. @@ -262,7 +262,7 @@ def days_to_hmsm(days): return int(hour), int(min), int(sec), int(micro) -def datetime_to_jd(date): +def datetime_to_jd(date: dt.datetime) -> float: """ Convert a `datetime.datetime` object to Julian Day. @@ -291,7 +291,7 @@ def datetime_to_jd(date): return date_to_jd(date.year, date.month, days) -def jd_to_datetime(jd): +def jd_to_datetime(jd: float) -> dt.datetime: """ Convert a Julian Day to an `jdutil.datetime` object. @@ -321,7 +321,7 @@ def jd_to_datetime(jd): return datetime(year, month, day, hour, min, sec, micro) -def timedelta_to_days(td): +def timedelta_to_days(td: dt.timedelta) -> float: """ Convert a `datetime.timedelta` object to a total number of days. 
@@ -396,7 +396,7 @@ def __sub__(self, other): return jd_to_datetime(combined) - elif isinstance(other, (datetime, dt.datetime)): + elif isinstance(other, datetime | dt.datetime): diff = datetime_to_jd(self) - datetime_to_jd(other) return dt.timedelta(diff) @@ -407,7 +407,7 @@ def __sub__(self, other): raise TypeError(s) def __rsub__(self, other): - if not isinstance(other, (datetime, dt.datetime)): + if not isinstance(other, datetime | dt.datetime): s = "jdutil.datetime supports '-' with: " s += "jdutil.datetime and datetime.datetime" raise TypeError(s) diff --git a/edtf/natlang/en.py b/edtf/natlang/en.py index f6eef54..077ae19 100644 --- a/edtf/natlang/en.py +++ b/edtf/natlang/en.py @@ -1,10 +1,10 @@ """Utilities to derive an EDTF string from an (English) natural language string.""" +import functools import re from datetime import datetime from dateutil.parser import ParserError, parse -from six.moves import xrange from edtf import appsettings @@ -14,19 +14,45 @@ DEFAULT_DATE_1 = datetime(1234, 1, 1, 0, 0) DEFAULT_DATE_2 = datetime(5678, 10, 10, 0, 0) -SHORT_YEAR_RE = r"(-?)([\dX])([\dX])([\dX])([\dX])" -LONG_YEAR_RE = r"Y(-?)([1-9]\d\d\d\d+)" -CENTURY_RE = r"(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?" -CE_RE = r"(\d{1,4}) (ad|ce|bc|bce)" +LONG_YEAR_RE = re.compile(r"y(-?)([1-9]\d\d\d\d+)") +CENTURY_RE = re.compile(r"(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?") +CENTURY_RANGE = re.compile(r"\b(\d\d)(th|st|nd|rd|)-(\d\d)(th|st|nd|rd) [cC]") +CE_RE = re.compile(r"(\d{1,4}) (ad|ce|bc|bce)") +ONE_DIGIT_PARTIAL_FIRST = re.compile(r"\d\D\b") +TWO_DIGIT_PARTIAL_FIRST = re.compile(r"\d\d\b") +PARTIAL_CHECK = re.compile(r"\b\d\d\d\d$") +SLASH_YEAR = re.compile(r"(\d\d\d\d)/(\d\d\d\d)") +BEFORE_CHECK = re.compile(r"\b(?:before|earlier|avant)\b") +AFTER_CHECK = re.compile(r"\b(after|since|later|aprés|apres)\b") +APPROX_CHECK = re.compile( + r"\b(?:ca?\.? 
?\d{4}|circa|approx|approximately|around|about|~\d{3,4})|^~" +) +UNCERTAIN_CHECK = re.compile(r"\b(?:uncertain|possibly|maybe|guess|\d{3,4}\?)") +UNCERTAIN_REPL = re.compile(r"(\d{4})\?") +MIGHT_BE_CENTURY = re.compile(r"(\d{2}00)s") +MIGHT_BE_DECADE = re.compile(r"(\d{3}0)s") + +APPROX_CENTURY_RE = re.compile( + r"\b(ca?\.?) ?(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?" +) +UNCERTAIN_CENTURY_RE = re.compile( + r"(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?\?" +) + +APPROX_CE_RE = re.compile(r"\b(ca?\.?) ?(\d{1,4}) (ad|ce|bc|bce)") +UNCERTAIN_CE_RE = re.compile(r"(\d{1,4}) (ad|ce|bc|bce)\?") + +MENTIONS_YEAR = re.compile(r"\byear\b.+(in|during)\b") +MENTIONS_MONTH = re.compile(r"\bmonth\b.+(in|during)\b") +MENTIONS_DAY = re.compile(r"\bday\b.+(in|during)\b") # Set of RE rules that will cause us to abort text processing, since we know # the results will be wrong. -REJECT_RULES = ( - r".*dynasty.*", # Don't parse '23rd Dynasty' to 'uuuu-uu-23' -) +REJECT_RULES = re.compile(r".*dynasty.*") # Don't parse '23rd Dynasty' to 'uuuu-uu-23' -def text_to_edtf(text): +@functools.lru_cache +def text_to_edtf(text: str) -> str | None: """ Generate EDTF string equivalent of a given natural language date string. """ @@ -36,7 +62,7 @@ def text_to_edtf(text): t = text.lower() # try parsing the whole thing - result = text_to_edtf_date(t) + result: str | None = text_to_edtf_date(t) if not result: # split by list delims and move fwd with the first thing that returns a non-empty string. @@ -44,7 +70,8 @@ def text_to_edtf(text): for split in [",", ";", "or"]: for list_item in t.split(split): # try parsing as an interval - split by '-' - toks = list_item.split("-") + toks: list[str] = list_item.split("-") + if len(toks) == 2: d1 = toks[0].strip() d2 = toks[1].strip() @@ -52,19 +79,20 @@ def text_to_edtf(text): # match looks from the beginning of the string, search # looks anywhere. - if re.match(r"\d\D\b", d2): # 1-digit year partial e.g. 
1868-9 + if re.match( + ONE_DIGIT_PARTIAL_FIRST, d2 + ): # 1-digit year partial e.g. 1868-9 if re.search( - r"\b\d\d\d\d$", d1 + PARTIAL_CHECK, d1 ): # TODO: evaluate it and see if it's a year d2 = d1[-4:-1] + d2 - elif re.match(r"\d\d\b", d2): # 2-digit year partial e.g. 1809-10 - if re.search(r"\b\d\d\d\d$", d1): + elif re.match( + TWO_DIGIT_PARTIAL_FIRST, d2 + ): # 2-digit year partial e.g. 1809-10 + if re.search(PARTIAL_CHECK, d1): d2 = d1[-4:-2] + d2 else: - century_range_match = re.search( - r"\b(\d\d)(th|st|nd|rd|)-(\d\d)(th|st|nd|rd) [cC]", - f"{d1}-{d2}", - ) + century_range_match = re.search(CENTURY_RANGE, f"{d1}-{d2}") if century_range_match: g = century_range_match.groups() d1 = f"{g[0]}C" @@ -74,7 +102,7 @@ def text_to_edtf(text): r2 = text_to_edtf_date(d2) if r1 and r2: - result = r1 + "/" + r2 + result = f"{r1}/{r2}" return result # is it an either/or year "1838/1862" - that has a different @@ -83,7 +111,7 @@ def text_to_edtf(text): # This whole section could be more friendly. else: - int_match = re.search(r"(\d\d\d\d)\/(\d\d\d\d)", list_item) + int_match = re.search(SLASH_YEAR, list_item) if int_match: return f"[{int_match.group(1)}, {int_match.group(2)}]" @@ -93,21 +121,19 @@ def text_to_edtf(text): if result: break - is_before = re.findall(r"\bbefore\b", t) - is_before = is_before or re.findall(r"\bearlier\b", t) - - is_after = re.findall(r"\bafter\b", t) - is_after = is_after or re.findall(r"\bsince\b", t) - is_after = is_after or re.findall(r"\blater\b", t) + is_before = re.findall(BEFORE_CHECK, t) + is_after = re.findall(AFTER_CHECK, t) if is_before: - result = f"/{result}" # unknown is replaced with null for intervals + result = f"/{result}" elif is_after: - result = f"{result}/" # unknown is replaced with null for intervals + result = f"{result}/" + return result -def text_to_edtf_date(text): +@functools.lru_cache +def text_to_edtf_date(text: str) -> str | None: """ Return EDTF string equivalent of a given natural language date string. 
@@ -116,37 +142,28 @@ def text_to_edtf_date(text): differ are undefined. """ if not text: - return + return None t = text.lower() - result = "" + result: str = "" - for reject_re in REJECT_RULES: - if re.match(reject_re, t): - return + if re.match(REJECT_RULES, t): + return None # matches on '1800s'. Needs to happen before is_decade. - could_be_century = re.findall(r"(\d{2}00)s", t) + could_be_century: list = re.findall(MIGHT_BE_CENTURY, t) # matches on '1800s' and '1910s'. Removes the 's'. # Needs to happen before is_uncertain because e.g. "1860s?" - t, is_decade = re.subn(r"(\d{3}0)s", r"\1", t) + t, is_decade = re.subn(MIGHT_BE_DECADE, r"\1", t) # detect approximation signifiers # a few 'circa' abbreviations just before the year - is_approximate = re.findall(r"\b(ca?\.?) ?\d{4}", t) + is_approximate = re.findall(APPROX_CHECK, t) # the word 'circa' anywhere - is_approximate = is_approximate or re.findall(r"\bcirca\b", t) - # the word 'approx'/'around'/'about' anywhere - is_approximate = is_approximate or re.findall(r"\b(approx|around|about)", t) - # a ~ before a year-ish number - is_approximate = is_approximate or re.findall(r"\b~\d{4}", t) - # a ~ at the beginning - is_approximate = is_approximate or re.findall(r"^~", t) # detect uncertainty signifiers - t, is_uncertain = re.subn(r"(\d{4})\?", r"\1", t) - # the words uncertain/maybe/guess anywhere - is_uncertain = is_uncertain or re.findall(r"\b(uncertain|possibly|maybe|guess)", t) + t, is_uncertain = re.subn(UNCERTAIN_REPL, r"\1", t) + is_uncertain = is_uncertain or re.findall(UNCERTAIN_CHECK, t) # detect century forms is_century = re.findall(CENTURY_RE, t) @@ -154,32 +171,29 @@ def text_to_edtf_date(text): # detect CE/BCE year form is_ce = re.findall(CE_RE, t) if is_century: - result = "%02dXX" % (int(is_century[0][0]) - 1,) - is_approximate = is_approximate or re.findall(r"\b(ca?\.?) ?" 
+ CENTURY_RE, t) - is_uncertain = is_uncertain or re.findall(CENTURY_RE + r"\?", t) + result = f"{int(is_century[0][0]) - 1:02d}XX" + is_approximate = is_approximate or re.findall(APPROX_CENTURY_RE, t) + is_uncertain = is_uncertain or re.findall(UNCERTAIN_CENTURY_RE, t) try: - is_bc = is_century[0][-1] in ("bc", "bce") - if is_bc: + if is_century[0][-1] in ("bc", "bce"): result = f"-{result}" except IndexError: pass elif is_ce: - result = "%04d" % (int(is_ce[0][0])) - is_approximate = is_approximate or re.findall(r"\b(ca?\.?) ?" + CE_RE, t) - is_uncertain = is_uncertain or re.findall(CE_RE + r"\?", t) + result = f"{int(is_ce[0][0]):04d}" + is_approximate = is_approximate or re.findall(APPROX_CE_RE, t) + is_uncertain = is_uncertain or re.findall(UNCERTAIN_CE_RE, t) try: - is_bc = is_ce[0][-1] in ("bc", "bce") - if is_bc: + if is_ce[0][-1] in ("bc", "bce"): result = f"-{result}" except IndexError: pass else: # try dateutil.parse - try: # parse twice, using different defaults to see what was # parsed and what was guessed. @@ -200,45 +214,45 @@ def text_to_edtf_date(text): ) except ParserError: - return + return None except Exception: - return + return None if dt1.date() == DEFAULT_DATE_1.date() and dt2.date() == DEFAULT_DATE_2.date(): # couldn't parse anything - defaults are untouched. - return + return None date1 = dt1.isoformat()[:10] date2 = dt2.isoformat()[:10] # guess precision of 'unspecified' characters to use - mentions_year = re.findall(r"\byear\b.+(in|during)\b", t) - mentions_month = re.findall(r"\bmonth\b.+(in|during)\b", t) - mentions_day = re.findall(r"\bday\b.+(in|during)\b", t) + mentions_year = re.findall(MENTIONS_YEAR, t) + mentions_month = re.findall(MENTIONS_MONTH, t) + mentions_day = re.findall(MENTIONS_DAY, t) - for i in xrange(len(date1)): + for i, char in enumerate(date1): # if the given year could be a century (e.g. '1800s') then use # approximate/uncertain markers to decide whether we treat it as # a century or a decade. 
if i == 2 and could_be_century and not (is_approximate or is_uncertain): result += "X" - elif i == 3 and is_decade > 0: + elif i == 3 and is_decade: if mentions_year: - result += "X" # previously year precision - now just X + result += "X" # year precision else: - result += "X" # previously decade precision - now just X - elif date1[i] == date2[i]: + result += "X" # decade precision + elif char == date2[i]: # since both attempts at parsing produced the same result # it must be parsed value, not a default - result += date1[i] + result += char else: # different values were produced, meaning that it's likely - # a default. Use 'X' + # a default. Use 'unspecified' result += "X" # strip off unknown chars from end of string - except the first 4 - for i in reversed(xrange(len(result))): + for i in reversed(range(len(result))): if result[i] not in ("X", "-"): smallest_length = 4 diff --git a/edtf/natlang/tests.py b/edtf/natlang/tests.py index 78ecbc9..e0acaad 100644 --- a/edtf/natlang/tests.py +++ b/edtf/natlang/tests.py @@ -182,6 +182,30 @@ def test_natlang(input_text, expected_output): Verify that the conversion from text to EDTF format matches the expected output. 
""" result = text_to_edtf(input_text) - assert ( - result == expected_output - ), f"Failed for input: {input_text} - expected {expected_output}, got {result}" + assert result == expected_output, ( + f"Failed for input: {input_text} - expected {expected_output}, got {result}" + ) + + +@pytest.mark.benchmark +@pytest.mark.parametrize( + "input_text,expected_output", + [ + ("23rd Dynasty", None), + ("January 2008", "2008-01"), + ("ca1860", "1860~"), + ("uncertain: approx 1862", "1862%"), + ("January", "XXXX-01"), + ("Winter 1872", "1872-24"), + ("before approx January 18 1928", "/1928-01-18~"), + ("birthday in 1872", "1872"), + ("1270 CE", "1270"), + ("2nd century bce", "-01XX"), + ("1858/1860", "[1858, 1860]"), + ], +) +def test_benchmark_natlang(benchmark, input_text, expected_output): + """ + Benchmark selected natural language conversions + """ + benchmark(text_to_edtf, input_text) diff --git a/edtf/parser/__init__.py b/edtf/parser/__init__.py index 43197d5..9cbf3c3 100644 --- a/edtf/parser/__init__.py +++ b/edtf/parser/__init__.py @@ -1,5 +1,5 @@ from .edtf_exceptions import EDTFParseException -from .grammar import parse_edtf +from .grammar import is_valid_edtf, parse_edtf from .parser_classes import ( UA, Consecutives, @@ -26,6 +26,7 @@ __all__ = [ "parse_edtf", + "is_valid_edtf", "EDTFParseException", "EDTFObject", "Date", diff --git a/edtf/parser/edtf_exceptions.py b/edtf/parser/edtf_exceptions.py index 9530602..d906d58 100644 --- a/edtf/parser/edtf_exceptions.py +++ b/edtf/parser/edtf_exceptions.py @@ -2,4 +2,28 @@ class EDTFParseException(ParseException): - pass + """Raised when an input cannot be parsed as an EDTF string. 
+ + Attributes: + input_string - the input string that could not be parsed + err -- the original ParseException that caused this one + """ + + def __init__(self, input_string, err=None): + if input_string is None: + input_string = "" + self.input_string = input_string + if err is None: + err = ParseException(input_string, 0, "Invalid input or format.") + self.err = err + super().__init__(str(err), err.loc if err.loc else 0, self.input_string) + + def __str__(self): + if not self.input_string: + return "You must supply some input text" + near_text = ( + self.input_string[max(self.err.loc - 10, 0) : self.err.loc + 10] + if hasattr(self.err, "loc") + else "" + ) + return f"Error at position {self.err.loc}: Invalid input or format near '{near_text}'. Please provide a valid EDTF string." diff --git a/edtf/parser/grammar.py b/edtf/parser/grammar.py index 730f47d..0624a92 100644 --- a/edtf/parser/grammar.py +++ b/edtf/parser/grammar.py @@ -1,3 +1,13 @@ +# ruff: noqa: E402 I001 + +# It's recommended to `enablePackrat()` immediately after importing pyparsing +# https://github.com/pyparsing/pyparsing/wiki/Performance-Tips + +import pyparsing +from edtf.appsettings import DEBUG_PYPARSING +from edtf.util import remapparams + +pyparsing.ParserElement.enable_packrat() from pyparsing import ( Combine, NotAny, @@ -9,10 +19,11 @@ Word, ZeroOrMore, nums, - oneOf, + one_of, ) from pyparsing import Literal as L + from edtf.parser.edtf_exceptions import EDTFParseException # (* ************************** Level 0 *************************** *) @@ -38,18 +49,19 @@ Unspecified, ) -oneThru12 = oneOf(["%.2d" % i for i in range(1, 13)]) -oneThru13 = oneOf(["%.2d" % i for i in range(1, 14)]) -oneThru23 = oneOf(["%.2d" % i for i in range(1, 24)]) -zeroThru23 = oneOf(["%.2d" % i for i in range(0, 24)]) -oneThru29 = oneOf(["%.2d" % i for i in range(1, 30)]) -oneThru30 = oneOf(["%.2d" % i for i in range(1, 31)]) -oneThru31 = oneOf(["%.2d" % i for i in range(1, 32)]) -oneThru59 = oneOf(["%.2d" % i 
for i in range(1, 60)]) -zeroThru59 = oneOf(["%.2d" % i for i in range(0, 60)]) - -positiveDigit = Word(nums, exact=1, excludeChars="0") +oneThru12 = one_of([f"{i:02}" for i in range(1, 13)]) +oneThru13 = one_of([f"{i:02}" for i in range(1, 14)]) +oneThru23 = one_of([f"{i:02}" for i in range(1, 24)]) +zeroThru23 = one_of([f"{i:02}" for i in range(0, 24)]) +oneThru29 = one_of([f"{i:02}" for i in range(1, 30)]) +oneThru30 = one_of([f"{i:02}" for i in range(1, 31)]) +oneThru31 = one_of([f"{i:02}" for i in range(1, 32)]) +oneThru59 = one_of([f"{i:02}" for i in range(1, 60)]) +zeroThru59 = one_of([f"{i:02}" for i in range(0, 60)]) + digit = Word(nums, exact=1) +positiveDigit = Word(nums, exact=1, exclude_chars="0") +positiveInteger = Combine(positiveDigit + ZeroOrMore(digit)) second = zeroThru59 minute = zeroThru59 @@ -58,18 +70,23 @@ month = oneThru12("month") monthDay = ( - (oneOf("01 03 05 07 08 10 12")("month") + "-" + oneThru31("day")) - ^ (oneOf("04 06 09 11")("month") + "-" + oneThru30("day")) + (one_of("01 03 05 07 08 10 12")("month") + "-" + oneThru31("day")) + ^ (one_of("04 06 09 11")("month") + "-" + oneThru30("day")) ^ (L("02")("month") + "-" + oneThru29("day")) ) +# Significant digits suffix +significantDigits = "S" + Word(nums)("significant_digits") + # 4 digits, 0 to 9 positiveYear = Word(nums, exact=4) # Negative version of positive year, but "-0000" is illegal negativeYear = NotAny(L("-0000")) + ("-" + positiveYear) -year = Combine(positiveYear ^ negativeYear)("year") +year = Combine(positiveYear ^ negativeYear)("year") + Optional(significantDigits) +# simple version for Consecutives +year_basic = Combine(positiveYear ^ negativeYear)("year") yearMonth = year + "-" + month yearMonthDay = year + "-" + monthDay # o hai iso date @@ -77,15 +94,15 @@ date = Combine(year ^ yearMonth ^ yearMonthDay)("date") Date.set_parser(date) -zoneOffsetHour = oneThru13 -zoneOffset = L("Z") ^ ( +zone_offsetHour = oneThru13 +zone_offset = L("Z") ^ ( Regex("[+-]") - + 
(zoneOffsetHour + Optional(":" + minute) ^ L("14:00") ^ ("00:" + oneThru59)) + + (zone_offsetHour + Optional(":" + minute) ^ L("14:00") ^ ("00:" + oneThru59)) ) baseTime = Combine(hour + ":" + minute + ":" + second ^ "24:00:00") -time = Combine(baseTime + Optional(zoneOffset))("time") +time = Combine(baseTime + Optional(zone_offset))("time") dateAndTime = date + "T" + time DateAndTime.set_parser(dateAndTime) @@ -99,10 +116,10 @@ # (* ************************** Level 1 *************************** *) # (* ** Auxiliary Assignments for Level 1 ** *) -UASymbol = Combine(oneOf("? ~ %")) +UASymbol = Combine(one_of("? ~ %")) UA.set_parser(UASymbol) -seasonNumber = oneOf("21 22 23 24") +seasonNumber = one_of("21 22 23 24") # (* *** Season (unqualified) *** *) season = year + "-" + seasonNumber("season") @@ -112,9 +129,13 @@ # (* *** Long Year - Simple Form *** *) -longYearSimple = "Y" + Combine( - Optional("-") + positiveDigit + digit + digit + digit + OneOrMore(digit) -)("year") +longYearSimple = ( + "Y" + + Combine(Optional("-") + positiveDigit + digit + digit + digit + OneOrMore(digit))( + "year" + ) + + Optional(significantDigits) +) LongYear.set_parser(longYearSimple) # (* *** L1Interval *** *) @@ -131,9 +152,9 @@ def f(toks): l1Start = ".." ^ uaDateOrSeason -l1Start.addParseAction(f) +l1Start.add_parse_action(f) l1End = uaDateOrSeason ^ ".." 
-l1End.addParseAction(f) +l1End.add_parse_action(f) level1Interval = Optional(l1Start)("lower") + "/" + l1End("upper") ^ l1Start( "lower" @@ -141,17 +162,19 @@ def f(toks): Level1Interval.set_parser(level1Interval) # (* *** unspecified *** *) -yearWithOneOrTwoUnspecifedDigits = Combine(digit + digit + (digit ^ "X") + "X")("year") +yearWithOneOrTwoOrThreeUnspecifedDigits = Combine( + Optional("-") + digit + (digit ^ "X") + (digit ^ "X") + "X" +)("year") monthUnspecified = year + "-" + L("XX")("month") dayUnspecified = yearMonth + "-" + L("XX")("day") dayAndMonthUnspecified = year + "-" + L("XX")("month") + "-" + L("XX")("day") unspecified = ( - yearWithOneOrTwoUnspecifedDigits + yearWithOneOrTwoOrThreeUnspecifedDigits ^ monthUnspecified ^ dayUnspecified ^ dayAndMonthUnspecified -) +) + Optional(UASymbol)("ua") Unspecified.set_parser(unspecified) # (* *** uncertainOrApproxDate *** *) @@ -173,7 +196,7 @@ def f(toks): dayWithX = Combine(("X" + digitOrX) ^ (digitOrX + "X"))("day") # 2-digit month with at least one 'X' present -monthWithX = Combine(oneOf("0X 1X") ^ ("X" + digitOrX))("month") +monthWithX = Combine(one_of("0X 1X") ^ ("X" + digitOrX))("month") # 4-digit year with at least one 'X' present yearWithX = Combine( @@ -238,13 +261,12 @@ def f(toks): seasonQualified = season + "^" + seasonQualifier # (* ** Long Year - Scientific Form ** *) -positiveInteger = Combine(positiveDigit + ZeroOrMore(digit)) longYearScientific = ( "Y" + Combine(Optional("-") + positiveInteger)("base") + "E" + positiveInteger("exponent") - + Optional("S" + positiveInteger("precision")) + + Optional(significantDigits) ) ExponentialYear.set_parser(longYearScientific) @@ -260,15 +282,13 @@ def f(toks): ) Level2Interval.set_parser(level2Interval) -# (* ** Masked precision ** *) eliminated in latest specs -# maskedPrecision = Combine(digit + digit + ((digit + "x") ^ "xx"))("year") -# MaskedPrecision.set_parser(maskedPrecision) - # (* ** Inclusive list and choice list** *) consecutives = ( 
(yearMonthDay("lower") + ".." + yearMonthDay("upper")) ^ (yearMonth("lower") + ".." + yearMonth("upper")) - ^ (year("lower") + ".." + year("upper")) + ^ ( + year_basic("lower") + ".." + year_basic("upper") + ) # using year_basic because some tests were throwing `'list' object has no attribute 'expandtabs'` - somewhere, pyparsing.parse_string() was being passed a list ) Consecutives.set_parser(consecutives) @@ -280,8 +300,8 @@ def f(toks): ^ consecutives ) -earlier = L("..").addParseAction(f)("lower") + date("upper").addParseAction(f) -later = date("lower").addParseAction(f) + L("..").addParseAction(f)("upper") +earlier = L("..").add_parse_action(f)("lower") + date("upper").add_parse_action(f) +later = date("lower").add_parse_action(f) + L("..").add_parse_action(f)("upper") EarlierConsecutives.set_parser(earlier) LaterConsecutives.set_parser(later) @@ -302,7 +322,9 @@ def f(toks): # (* *** L2 Season *** *) -seasonL2Number = oneOf("21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41") +seasonL2Number = one_of( + "21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41" +) l2season = year + "-" + seasonL2Number("season") Level2Season.set_parser(l2season) @@ -323,14 +345,32 @@ def f(toks): ) -def parse_edtf(str, parseAll=True, fail_silently=False): +@remapparams(parseAll="parse_all") +def parse_edtf( + input_string: str, + parse_all: bool = True, + fail_silently: bool = False, + debug: bool | None = None, +): + if debug is None: + debug = DEBUG_PYPARSING + + if not input_string: + raise EDTFParseException(input_string) + try: - if not str: - raise ParseException("You must supply some input text") - p = edtfParser.parseString(str.strip(), parseAll) + p = edtfParser.parse_string(input_string.strip(), parse_all) if p: return p[0] + return None except ParseException as err: if fail_silently: return None - raise EDTFParseException(err) from err + if debug: + raise + raise EDTFParseException(input_string, err) from None + + +def is_valid_edtf(input_string: 
str) -> bool: + """Returns True if the input string was successfully parsed; False if it isn't.""" + return parse_edtf(input_string, fail_silently=True) is not None diff --git a/edtf/parser/parser_classes.py b/edtf/parser/parser_classes.py index bb9a213..67dd8ee 100644 --- a/edtf/parser/parser_classes.py +++ b/edtf/parser/parser_classes.py @@ -1,9 +1,10 @@ import calendar import math -import re +from collections.abc import Callable from datetime import date, datetime from operator import add, sub from time import struct_time +from typing import Optional from dateutil.relativedelta import relativedelta @@ -27,7 +28,7 @@ PRECISION_DAY = "day" -def days_in_month(year, month): +def days_in_month(year: int, month: int) -> int: """ Return the number of days in the given year and month, where month is 1=January to 12=December, and respecting leap years as identified by @@ -49,7 +50,7 @@ def days_in_month(year, month): }[month] -def apply_delta(op, time_struct, delta): +def apply_delta(op: Callable, time_struct: struct_time, delta) -> struct_time: """ Apply a `relativedelta` to a `struct_time` data structure. @@ -75,9 +76,9 @@ def apply_delta(op, time_struct, delta): # Adjust the year to be close to the 2000 millenium in 1,000 year # increments to try and retain accurate relative leap years - actual_year = time_struct.tm_year - millenium = int(float(actual_year) / 1000) - millenium_diff = (2 - millenium) * 1000 + actual_year: int = time_struct.tm_year + millenium: int = int(float(actual_year) / 1000) + millenium_diff: int = (2 - millenium) * 1000 adjusted_year = actual_year + millenium_diff # Apply delta to the date/time with adjusted year dt = datetime(*(adjusted_year,) + time_struct[1:6]) @@ -91,16 +92,19 @@ def apply_delta(op, time_struct, delta): class EDTFObject: """ - Object to attact to a parser to become instantiated when the parser + Object to attach to a parser to become instantiated when the parser completes. 
""" parser = None + _is_approximate: bool + _is_uncertain: bool + _uncertain_and_approximate: bool @classmethod def set_parser(cls, p): cls.parser = p - p.addParseAction(cls.parse_action) + p.add_parse_action(cls.parse_action) @classmethod def parse_action(cls, toks): @@ -113,68 +117,69 @@ def parse_action(cls, toks): @classmethod def parse(cls, s): - return cls.parser.parseString(s)[0] + return cls.parser.parse_string(s)[0] - def __repr__(self): + def __repr__(self) -> str: return f"{type(self).__name__}: '{str(self)}'" - def __init__(self, *args, **kwargs): - str = f"{type(self).__name__}.__init__(*{args}, **{kwargs})" - raise NotImplementedError(f"{str} is not implemented.") + def __init__(self, *args, **kwargs) -> None: + message: str = f"{type(self).__name__}.__init__(*{args}, **{kwargs})" + raise NotImplementedError(f"{message} is not implemented.") - def __str__(self): + def __str__(self) -> str: raise NotImplementedError - def _strict_date(self, lean): + def _strict_date(self, lean: str = EARLIEST): raise NotImplementedError - def lower_strict(self): + def lower_strict(self) -> struct_time: return self._strict_date(lean=EARLIEST) - def upper_strict(self): + def upper_strict(self) -> struct_time: return self._strict_date(lean=LATEST) - def _get_fuzzy_padding(self, lean): + def _get_fuzzy_padding(self, lean: str) -> relativedelta: """ Subclasses should override this to pad based on how precise they are. 
""" - return relativedelta(0) + return relativedelta(None) - def get_is_approximate(self): + def get_is_approximate(self) -> bool: return getattr(self, "_is_approximate", False) - def set_is_approximate(self, val): + def set_is_approximate(self, val: bool) -> None: self._is_approximate = val - is_approximate = property(get_is_approximate, set_is_approximate) + is_approximate = property(get_is_approximate, set_is_approximate) # noqa - def get_is_uncertain(self): + def get_is_uncertain(self) -> bool: return getattr(self, "_is_uncertain", False) - def set_is_uncertain(self, val): + def set_is_uncertain(self, val: bool) -> None: self._is_uncertain = val - is_uncertain = property(get_is_uncertain, set_is_uncertain) + is_uncertain = property(get_is_uncertain, set_is_uncertain) # noqa - def get_is_uncertain_and_approximate(self): + def get_is_uncertain_and_approximate(self) -> bool: return getattr(self, "_uncertain_and_approximate", False) - def set_is_uncertain_and_approximate(self, val): + def set_is_uncertain_and_approximate(self, val: bool) -> None: self._uncertain_and_approximate = val is_uncertain_and_approximate = property( - get_is_uncertain_and_approximate, set_is_uncertain_and_approximate + get_is_uncertain_and_approximate, # noqa + set_is_uncertain_and_approximate, # noqa ) - def lower_fuzzy(self): + def lower_fuzzy(self) -> struct_time: strict_val = self.lower_strict() return apply_delta(sub, strict_val, self._get_fuzzy_padding(EARLIEST)) - def upper_fuzzy(self): + def upper_fuzzy(self) -> struct_time: strict_val = self.upper_strict() return apply_delta(add, strict_val, self._get_fuzzy_padding(LATEST)) - def __eq__(self, other): + def __eq__(self, other) -> bool: if isinstance(other, EDTFObject): return str(self) == str(other) elif isinstance(other, date): @@ -183,7 +188,7 @@ def __eq__(self, other): return self._strict_date() == trim_struct_time(other) return False - def __ne__(self, other): + def __ne__(self, other) -> bool: if isinstance(other, EDTFObject): 
return str(self) != str(other) elif isinstance(other, date): @@ -192,7 +197,7 @@ def __ne__(self, other): return self._strict_date() != trim_struct_time(other) return True - def __gt__(self, other): + def __gt__(self, other) -> bool: if isinstance(other, EDTFObject): return self.lower_strict() > other.lower_strict() elif isinstance(other, date): @@ -203,7 +208,7 @@ def __gt__(self, other): f"can't compare {type(self).__name__} with {type(other).__name__}" ) - def __ge__(self, other): + def __ge__(self, other) -> bool: if isinstance(other, EDTFObject): return self.lower_strict() >= other.lower_strict() elif isinstance(other, date): @@ -214,7 +219,7 @@ def __ge__(self, other): f"can't compare {type(self).__name__} with {type(other).__name__}" ) - def __lt__(self, other): + def __lt__(self, other) -> bool: if isinstance(other, EDTFObject): return self.lower_strict() < other.lower_strict() elif isinstance(other, date): @@ -225,7 +230,7 @@ def __lt__(self, other): f"can't compare {type(self).__name__} with {type(other).__name__}" ) - def __le__(self, other): + def __le__(self, other) -> bool: if isinstance(other, EDTFObject): return self.lower_strict() <= other.lower_strict() elif isinstance(other, date): @@ -241,81 +246,132 @@ def __le__(self, other): class Date(EDTFObject): - def set_year(self, y): + def __init__( # noqa + self, + year: str | None = None, + month: str | None = None, + day: str | None = None, + significant_digits=None, + **kwargs, + ): + for param in ("date", "lower", "upper"): + if param in kwargs: + self.__init__(**kwargs[param]) + return + + self._year: str | None = ( + year # Year is required, but sometimes passed in as a 'date' dict. 
+ ) + self._month: str | None = month + self._day: str | None = day + self.significant_digits: int | None = ( + int(significant_digits) if significant_digits else None + ) + + def set_year(self, y: str | None): if y is None: raise AttributeError("Year must not be None") self._year = y - def get_year(self): + def get_year(self) -> str | None: return self._year - year = property(get_year, set_year) + year = property(get_year, set_year) # noqa - def set_month(self, m): + def set_month(self, m: str | None): self._month = m if m is None: - self.day = None + self._day = None - def get_month(self): + def get_month(self) -> str | None: return self._month - month = property(get_month, set_month) + month = property(get_month, set_month) # noqa - def __init__(self, year=None, month=None, day=None, **kwargs): - for param in ("date", "lower", "upper"): - if param in kwargs: - self.__init__(**kwargs[param]) - return + def set_day(self, d: str | None): + self._day = d + if d is None: + self._day = None - self.year = year # Year is required, but sometimes passed in as a 'date' dict. 
- self.month = month - self.day = day + def get_day(self) -> str | None: + return self._day - def __str__(self): - r = self.year - if self.month: - r += f"-{self.month}" - if self.day: - r += f"-{self.day}" + day = property(get_day, set_day) # noqa + + def __str__(self) -> str: + r = f"{self._year}" + if self._month is not None: + r += f"-{self._month}" + if self._day is not None: + r += f"-{self._day}" + if self.significant_digits: + r += f"S{self.significant_digits}" return r - def isoformat(self, default=date.max): - return "%s-%02d-%02d" % ( - self.year, - int(self.month or default.month), - int(self.day or default.day), + def isoformat(self, default=date.max) -> str: + return f"{self._year}-{int(self._month or default.month):02d}-{int(self._day or default.day):02d}" + + def lower_fuzzy(self) -> struct_time: + if not hasattr(self, "significant_digits") or not self.significant_digits: + return apply_delta( + sub, self.lower_strict(), self._get_fuzzy_padding(EARLIEST) + ) + + total_digits: int = len(self._year) if self._year else 0 + i_year: int = int(self._year) if self._year else 0 + insignificant_digits: int = total_digits - self.significant_digits + lower_year: int = ( + i_year // (10**insignificant_digits) * (10**insignificant_digits) ) + return struct_time([lower_year, 1, 1] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS) - def _precise_year(self, lean): + def upper_fuzzy(self) -> struct_time: + if not hasattr(self, "significant_digits") or not self.significant_digits: + return apply_delta( + add, self.upper_strict(), self._get_fuzzy_padding(LATEST) + ) + + total_digits: int = len(self._year) if self._year else 0 + i_year: int = int(self._year) if self._year else 0 + insignificant_digits: int = total_digits - self.significant_digits + upper_year: int = (i_year // (10**insignificant_digits) + 1) * ( + 10**insignificant_digits + ) - 1 + return struct_time([upper_year, 12, 31] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS) + + def _precise_year(self, lean: str) -> int: # 
Replace any ambiguous characters in the year string with 0s or 9s + if not self._year: + return 0 + if lean == EARLIEST: - return int(re.sub(r"X", r"0", self.year)) + rep = self._year.replace("X", "0") else: - return int(re.sub(r"X", r"9", self.year)) + rep = self._year.replace("X", "9") - def _precise_month(self, lean): - if self.month and self.month != "XX": + return int(rep) + + def _precise_month(self, lean: str) -> int: + if self._month and self._month != "XX": try: - return int(self.month) + return int(self._month) except ValueError as err: raise ValueError( - f"Couldn't convert {self.month} to int (in {self})" + f"Couldn't convert {self._month} to int (in {self})" ) from err - else: - return 1 if lean == EARLIEST else 12 + return 1 if lean == EARLIEST else 12 - def _precise_day(self, lean): - if not self.day or self.day == "XX": + def _precise_day(self, lean: str) -> int: + if not self._day or self._day == "XX": if lean == EARLIEST: return 1 else: return days_in_month( self._precise_year(LATEST), self._precise_month(LATEST) ) - else: - return int(self.day) + return int(self._day) - def _strict_date(self, lean): + def _strict_date(self, lean: str = EARLIEST) -> struct_time: """ Return a `time.struct_time` representation of the date. 
""" @@ -330,36 +386,39 @@ def _strict_date(self, lean): ) @property - def precision(self): - if self.day: + def precision(self) -> str: + if self._day: return PRECISION_DAY - if self.month: + if self._month: return PRECISION_MONTH return PRECISION_YEAR + def estimated(self) -> int: + return self._precise_year(EARLIEST) + class DateAndTime(EDTFObject): - def __init__(self, date, time): - self.date = date + def __init__(self, date: Date, time): # noqa: super raises not implemented + self.date: Date = date self.time = time - def __str__(self): + def __str__(self) -> str: return self.isoformat() - def isoformat(self): + def isoformat(self) -> str: return self.date.isoformat() + "T" + self.time - def _strict_date(self, lean): + def _strict_date(self, lean: str = EARLIEST) -> struct_time: return self.date._strict_date(lean) - def __eq__(self, other): + def __eq__(self, other) -> bool: if isinstance(other, datetime): return self.isoformat() == other.isoformat() elif isinstance(other, struct_time): return self._strict_date() == trim_struct_time(other) return super().__eq__(other) - def __ne__(self, other): + def __ne__(self, other) -> bool: if isinstance(other, datetime): return self.isoformat() != other.isoformat() elif isinstance(other, struct_time): @@ -368,22 +427,20 @@ def __ne__(self, other): class Interval(EDTFObject): - def __init__(self, lower, upper): + def __init__(self, lower, upper): # noqa: super() raises not implemented self.lower = lower self.upper = upper def __str__(self): return f"{self.lower}/{self.upper}" - def _strict_date(self, lean): + def _strict_date(self, lean: str = EARLIEST) -> struct_time: if lean == EARLIEST: - r = self.lower._strict_date(lean) - else: - r = self.upper._strict_date(lean) - return r + return self.lower._strict_date(lean) + return self.upper._strict_date(lean) @property - def precision(self): + def precision(self) -> int | None: if self.lower.precision == self.upper.precision: return self.lower.precision return None @@ -398,46 
+455,50 @@ def parse_action(cls, toks): args = toks.asList() return cls(*args) - def __init__(self, *args): + def __init__(self, *args) -> None: # noqa: super() raises not implemented if len(args) != 1: raise AssertionError("UA must have exactly one argument") ua = args[0] - self.is_uncertain = "?" in ua - self.is_approximate = "~" in ua - self.is_uncertain_and_approximate = "%" in ua + self.is_uncertain: bool = "?" in ua + self.is_approximate: bool = "~" in ua + self.is_uncertain_and_approximate: bool = "%" in ua - def __str__(self): - d = "" + def __str__(self) -> str: if self.is_uncertain: - d += "?" - if self.is_approximate: - d += "~" - if self.is_uncertain_and_approximate: - d += "%" - return d + return "?" + elif self.is_approximate: + return "~" + elif self.is_uncertain_and_approximate: + return "%" + return "" - def _get_multiplier(self): + def _get_multiplier(self) -> float | None: if self.is_uncertain_and_approximate: return appsettings.MULTIPLIER_IF_BOTH elif self.is_uncertain: return appsettings.MULTIPLIER_IF_UNCERTAIN elif self.is_approximate: return appsettings.MULTIPLIER_IF_APPROXIMATE + return None class UncertainOrApproximate(EDTFObject): - def __init__(self, date, ua): + def __init__(self, date, ua): # noqa: super() raises not implemented self.date = date self.ua = ua + self.is_uncertain = ua.is_uncertain if ua else False + self.is_approximate = ua.is_approximate if ua else False + self.is_uncertain_and_approximate = ( + ua.is_uncertain_and_approximate if ua else False + ) - def __str__(self): + def __str__(self) -> str: if self.ua: return f"{self.date}{self.ua}" - else: - return str(self.date) + return str(self.date) - def _strict_date(self, lean): + def _strict_date(self, lean: str = EARLIEST) -> tuple: return self.date._strict_date(lean) def _get_fuzzy_padding(self, lean): @@ -466,7 +527,7 @@ def _get_fuzzy_padding(self, lean): class UnspecifiedIntervalSection(EDTFObject): - def __init__(self, sectionOpen=False, other_section_element=None): + 
def __init__(self, sectionOpen=False, other_section_element=None): # noqa: super() raises not implemented if sectionOpen: self.is_open = True self.is_unknown = False @@ -478,22 +539,25 @@ def __init__(self, sectionOpen=False, other_section_element=None): def __str__(self): if self.is_unknown: return "" - else: - return ".." + return ".." + + def _strict_date(self, lean: str = EARLIEST) -> float | None: + if lean not in (EARLIEST, LATEST): + raise ValueError("lean must be one of EARLIEST or LATEST") - def _strict_date(self, lean): if lean == EARLIEST: if self.is_unknown: upper = self.other._strict_date(LATEST) return apply_delta(sub, upper, appsettings.DELTA_IF_UNKNOWN) else: return -math.inf - else: + elif lean == LATEST: if self.is_unknown: lower = self.other._strict_date(EARLIEST) return apply_delta(add, lower, appsettings.DELTA_IF_UNKNOWN) else: return math.inf + return None @property def precision(self): @@ -501,11 +565,156 @@ def precision(self): class Unspecified(Date): - pass + def __init__( + self, + year=None, + month=None, + day=None, + significant_digits=None, + ua=None, + **kwargs, + ): + super().__init__( + year=year, + month=month, + day=day, + significant_digits=significant_digits, + **kwargs, + ) + self.ua = ua + self.is_uncertain = ua.is_uncertain if ua else False + self.is_approximate = ua.is_approximate if ua else False + self.is_uncertain_and_approximate = ( + ua.is_uncertain_and_approximate if ua else False + ) + self.negative = self.year.startswith("-") + + def __str__(self): + base = super().__str__() + if self.ua: + base += str(self.ua) + return base + + def _get_fuzzy_padding(self, lean): + if not self.ua: + return relativedelta() + multiplier = self.ua._get_multiplier() + padding = relativedelta() + + if self.year: + years_padding = self._years_padding(multiplier) + padding += years_padding + if self.month: + padding += relativedelta( + months=int(multiplier * appsettings.PADDING_MONTH_PRECISION.months) + ) + if self.day: + padding += 
relativedelta( + days=int(multiplier * appsettings.PADDING_DAY_PRECISION.days) + ) + return padding + + def _years_padding(self, multiplier): + """Calculate year padding based on the precision.""" + precision_settings = { + PRECISION_MILLENIUM: appsettings.PADDING_MILLENNIUM_PRECISION.years, + PRECISION_CENTURY: appsettings.PADDING_CENTURY_PRECISION.years, + PRECISION_DECADE: appsettings.PADDING_DECADE_PRECISION.years, + PRECISION_YEAR: appsettings.PADDING_YEAR_PRECISION.years, + } + years = precision_settings.get(self.precision, 0) + return relativedelta(years=int(multiplier * years)) + + def lower_fuzzy(self): + strict_val = ( + self.lower_strict() + ) # negative handled in the lower_strict() override + adjusted = apply_delta(sub, strict_val, self._get_fuzzy_padding(EARLIEST)) + return adjusted + + def upper_fuzzy(self): + strict_val = ( + self.upper_strict() + ) # negative handled in the upper_strict() override + + adjusted = apply_delta(add, strict_val, self._get_fuzzy_padding(LATEST)) + return adjusted + + def lower_strict(self): + if self.negative: + strict_val = self._strict_date( + lean=LATEST + ) # gets the year right, but need to adjust day and month + if self.precision in ( + PRECISION_YEAR, + PRECISION_DECADE, + PRECISION_CENTURY, + PRECISION_MILLENIUM, + ): + return struct_time( + (strict_val.tm_year, 1, 1) + + tuple(TIME_EMPTY_TIME) + + tuple(TIME_EMPTY_EXTRAS) + ) + elif self.precision == PRECISION_MONTH: + return struct_time( + (strict_val.tm_year, strict_val.tm_mon, 1) + + tuple(TIME_EMPTY_TIME) + + tuple(TIME_EMPTY_EXTRAS) + ) + else: + return strict_val + + return self._strict_date(lean=EARLIEST) + + def upper_strict(self) -> struct_time: + if self.negative: + strict_val = self._strict_date(lean=EARLIEST) + if self.precision in ( + PRECISION_YEAR, + PRECISION_DECADE, + PRECISION_CENTURY, + PRECISION_MILLENIUM, + ): + return struct_time( + (strict_val.tm_year, 12, 31) + + tuple(TIME_EMPTY_TIME) + + tuple(TIME_EMPTY_EXTRAS) + ) + elif 
self.precision == PRECISION_MONTH: + days_in_month = calendar.monthrange( + strict_val.tm_year, strict_val.tm_mon + )[1] + return struct_time( + (strict_val.tm_year, strict_val.tm_mon, days_in_month) + + tuple(TIME_EMPTY_TIME) + + tuple(TIME_EMPTY_EXTRAS) + ) + else: + return strict_val + return self._strict_date(lean=LATEST) + + @property + def precision(self): + if self.day: + return PRECISION_DAY + if self.month: + return PRECISION_MONTH + if self.year: + year_no_symbol = self.year.lstrip("-") + if year_no_symbol.isdigit(): + return PRECISION_YEAR + if len(year_no_symbol) == 4 and year_no_symbol.endswith("XXX"): + return PRECISION_MILLENIUM + if len(year_no_symbol) == 4 and year_no_symbol.endswith("XX"): + return PRECISION_CENTURY + if len(year_no_symbol) == 4 and year_no_symbol.endswith("X"): + return PRECISION_DECADE + raise ValueError(f"Unspecified date {self} has no precision") class Level1Interval(Interval): - def __init__(self, lower=None, upper=None): + def __init__(self, lower: Optional[dict] = None, upper: Optional[dict] = None): # noqa if lower: if lower["date"] == "..": self.lower = UnspecifiedIntervalSection( @@ -528,74 +737,122 @@ def __init__(self, lower=None, upper=None): self.upper = UnspecifiedIntervalSection( False, UncertainOrApproximate(**lower) ) + self.is_approximate: bool = ( + self.lower.is_approximate or self.upper.is_approximate + ) + self.is_uncertain: bool = self.lower.is_uncertain or self.upper.is_uncertain + self.is_uncertain_and_approximate = ( + self.lower.is_uncertain_and_approximate + or self.upper.is_uncertain_and_approximate + ) - def _get_fuzzy_padding(self, lean): + def _get_fuzzy_padding(self, lean) -> relativedelta | None: if lean == EARLIEST: return self.lower._get_fuzzy_padding(lean) elif lean == LATEST: return self.upper._get_fuzzy_padding(lean) + return None class LongYear(EDTFObject): - def __init__(self, year): - self.year = year + def __init__(self, year: str, significant_digits: str | None = None): # noqa + 
self.year: str = year + self.significant_digits: int | None = ( + int(significant_digits) if significant_digits else None + ) - def __str__(self): + def __str__(self) -> str: + if self.significant_digits: + return f"Y{self.year}S{self.significant_digits}" return f"Y{self.year}" - def _precise_year(self): + def _precise_year(self) -> int: return int(self.year) - def _strict_date(self, lean): + def _strict_date(self, lean: str = EARLIEST) -> struct_time: py = self._precise_year() if lean == EARLIEST: return struct_time([py, 1, 1] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS) + return struct_time([py, 12, 31] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS) + + def estimated(self) -> int: + return self._precise_year() + + def lower_fuzzy(self) -> struct_time: + full_year = self._precise_year() + strict_val = self.lower_strict() + if not self.significant_digits: + return apply_delta(sub, strict_val, self._get_fuzzy_padding(EARLIEST)) + + insignificant_digits = len(str(full_year)) - int(self.significant_digits) + if insignificant_digits <= 0: + return apply_delta(sub, strict_val, self._get_fuzzy_padding(EARLIEST)) + + padding_value = 10**insignificant_digits + sig_digits = full_year // padding_value + lower_year = sig_digits * padding_value + return apply_delta( + sub, + struct_time([lower_year, 1, 1] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS), + self._get_fuzzy_padding(EARLIEST), + ) + + def upper_fuzzy(self) -> struct_time: + full_year = self._precise_year() + strict_val = self.upper_strict() + if not self.significant_digits: + return apply_delta(add, strict_val, self._get_fuzzy_padding(LATEST)) else: - return struct_time([py, 12, 31] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS) + insignificant_digits = len(str(full_year)) - self.significant_digits + if insignificant_digits <= 0: + return apply_delta(add, strict_val, self._get_fuzzy_padding(LATEST)) + padding_value = 10**insignificant_digits + sig_digits = full_year // padding_value + upper_year = (sig_digits + 1) * padding_value - 1 + return 
apply_delta( + add, + struct_time([upper_year, 12, 31] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS), + self._get_fuzzy_padding(LATEST), + ) class Season(Date): - def __init__(self, year, season, **kwargs): + def __init__(self, year, season, **kwargs): # noqa self.year = year self.season = season # use season to look up month # day isn't part of the 'season' spec, but it helps the inherited # `Date` methods do their thing. self.day = None - def __str__(self): + def __str__(self) -> str: return f"{self.year}-{self.season}" - def _precise_month(self, lean): + def _precise_month(self, lean: str) -> int: rng = appsettings.SEASON_L2_MONTHS_RANGE[int(self.season)] if lean == EARLIEST: return rng[0] - else: - return rng[1] + + return rng[1] # (* ************************** Level 2 *************************** *) class PartialUncertainOrApproximate(Date): - def set_year(self, y): # Year can be None. - self._year = y - - year = property(Date.get_year, set_year) - - def __init__( + def __init__( # noqa self, year=None, month=None, day=None, - year_ua=False, - month_ua=False, - day_ua=False, - year_month_ua=False, - month_day_ua=False, + year_ua: UA | None = None, + month_ua: UA | None = None, + day_ua: UA | None = None, + year_month_ua: UA | None = None, + month_day_ua: UA | None = None, ssn=None, - season_ua=False, - all_ua=False, - year_ua_b=False, + season_ua: UA | None = None, + all_ua: UA | None = None, + year_ua_b: UA | None = None, ): self.year = year self.month = month @@ -614,7 +871,29 @@ def __init__( self.all_ua = all_ua - def __str__(self): + uas = [ + year_ua, + year_ua_b, + month_ua, + day_ua, + year_month_ua, + month_day_ua, + season_ua, + all_ua, + ] + self.is_uncertain: bool = any( + item.is_uncertain for item in uas if hasattr(item, "is_uncertain") + ) + self.is_approximate: bool = any( + item.is_approximate for item in uas if hasattr(item, "is_approximate") + ) + self.is_uncertain_and_approximate: bool = any( + item.is_uncertain_and_approximate + for item in uas + 
if hasattr(item, "is_uncertain_and_approximate") + ) + + def __str__(self) -> str: if self.season_ua: return f"{self.season}{self.season_ua}" @@ -623,7 +902,10 @@ def __str__(self): else: y = f"{self.year_ua_b}{self.year}" if self.year_ua_b else str(self.year) - m = f"{self.month_ua}{self.month}" if self.month_ua else str(self.month) + if self.month: + m = f"{self.month_ua}{self.month}" if self.month_ua else str(self.month) + else: + m = None if self.day: d = f"{self.day_ua}{self.day}" if self.day_ua else str(self.day) @@ -639,35 +921,45 @@ def __str__(self): else: result = f"{y}-({m}-{d}){self.month_day_ua}" else: - result = f"{y}-{m}-{d}" if d else f"{y}-{m}" + if d: + result = f"{y}-{m}-{d}" + elif m: + result = f"{y}-{m}" + else: + result = y if self.all_ua: result = f"({result}){self.all_ua}" return result - def _precise_year(self, lean): + def set_year(self, y): # Year can be None. + self._year = y + + year = property(Date.get_year, set_year) # noqa + + def _precise_year(self, lean: str) -> int: if self.season: return self.season._precise_year(lean) return super()._precise_year(lean) - def _precise_month(self, lean): + def _precise_month(self, lean: str) -> int: if self.season: return self.season._precise_month(lean) return super()._precise_month(lean) - def _precise_day(self, lean): + def _precise_day(self, lean: str) -> int: if self.season: return self.season._precise_day(lean) return super()._precise_day(lean) - def _get_fuzzy_padding(self, lean): + def _get_fuzzy_padding(self, lean: str) -> struct_time: """ This is not a perfect interpretation as fuzziness is introduced for redundant uncertainly modifiers e.g. (2006~)~ will get two sets of fuzziness. 
""" - result = relativedelta(0) + result = relativedelta(None) if self.year_ua: result += ( @@ -729,7 +1021,7 @@ class PartialUnspecified(Unspecified): class Consecutives(Interval): # Treating Consecutive ranges as intervals where one bound is optional - def __init__(self, lower=None, upper=None): + def __init__(self, lower=None, upper=None): # noqa if lower and not isinstance(lower, EDTFObject): self.lower = Date.parse(lower) else: @@ -740,33 +1032,34 @@ def __init__(self, lower=None, upper=None): else: self.upper = upper - def __str__(self): - return "{}..{}".format(self.lower or "", self.upper or "") + def __str__(self) -> str: + return f"{self.lower or ''}..{self.upper or ''}" class EarlierConsecutives(Level1Interval): - def __str__(self): + def __str__(self) -> str: return f"{self.lower}{self.upper}" class LaterConsecutives(Level1Interval): - def __str__(self): + def __str__(self) -> str: return f"{self.lower}{self.upper}" class OneOfASet(EDTFObject): + def __init__(self, *args): # noqa + self.objects = args + @classmethod def parse_action(cls, toks): args = [t for t in toks.asList() if isinstance(t, EDTFObject)] return cls(*args) - def __init__(self, *args): - self.objects = args - - def __str__(self): - return "[{}]".format(", ".join([str(o) for o in self.objects])) + def __str__(self) -> str: + out: str = ", ".join([str(o) for o in self.objects]) + return f"[{out}]" - def _strict_date(self, lean): + def _strict_date(self, lean: str = EARLIEST) -> float: strict_dates = [x._strict_date(lean) for x in self.objects] # Accounting for possible 'inf' and '-inf' values if lean == LATEST: @@ -788,61 +1081,69 @@ def _strict_date(self, lean): class MultipleDates(EDTFObject): + def __init__(self, *args): # noqa + self.objects = args + @classmethod def parse_action(cls, toks): args = [t for t in toks.asList() if isinstance(t, EDTFObject)] return cls(*args) - def __init__(self, *args): - self.objects = args - - def __str__(self): - return "{{{}}}".format(", 
".join([str(o) for o in self.objects])) + def __str__(self) -> str: + out: str = ", ".join([str(o) for o in self.objects]) + return f"{{{out}}}" - def _strict_date(self, lean): + def _strict_date(self, lean: str = EARLIEST) -> float: if lean == LATEST: return max([x._strict_date(lean) for x in self.objects]) - else: - return min([x._strict_date(lean) for x in self.objects]) - - -class MaskedPrecision(Date): - pass + return min([x._strict_date(lean) for x in self.objects]) class Level2Interval(Level1Interval): - def __init__(self, lower, upper): + def __init__(self, lower, upper): # noqa # Check whether incoming lower/upper values are single-item lists, and # if so take just the first item. This works around what I *think* is a - # bug in the grammer that provides us with single-item lists of + # bug in the grammar that provides us with single-item lists of # `PartialUncertainOrApproximate` items for lower/upper values. - if isinstance(lower, (tuple, list)) and len(lower) == 1: + if isinstance(lower, tuple | list) and len(lower) == 1: self.lower = lower[0] else: self.lower = lower - if isinstance(lower, (tuple, list)) and len(upper) == 1: + + if isinstance(lower, tuple | list) and len(upper) == 1: self.upper = upper[0] else: self.upper = upper + self.is_approximate = self.lower.is_approximate or self.upper.is_approximate + self.is_uncertain = self.lower.is_uncertain or self.upper.is_uncertain + self.is_uncertain_and_approximate = ( + self.lower.is_uncertain_and_approximate + or self.upper.is_uncertain_and_approximate + ) + class Level2Season(Season): pass class ExponentialYear(LongYear): - def __init__(self, base, exponent, precision=None): + def __init__(self, base, exponent, significant_digits=None): # noqa self.base = base self.exponent = exponent - self.precision = precision + self.significant_digits = ( + int(significant_digits) if significant_digits else None + ) - def _precise_year(self): + def _precise_year(self) -> int: return int(self.base) * 10 ** 
int(self.exponent) - def get_year(self): - if self.precision: - return f"{self.base}E{self.exponent}S{self.precision}" - else: - return f"{self.base}E{self.exponent}" + def get_year(self) -> str: + if self.significant_digits: + return f"{self.base}E{self.exponent}S{self.significant_digits}" + return f"{self.base}E{self.exponent}" + + year = property(get_year) # noqa - year = property(get_year) + def estimated(self) -> int: + return self._precise_year() diff --git a/edtf/parser/tests.py b/edtf/parser/tests.py index 8d9a770..8b3c1d9 100644 --- a/edtf/parser/tests.py +++ b/edtf/parser/tests.py @@ -14,8 +14,8 @@ # where the first value is a tuple, the second item is a tuple of the normalised parse result. # # The values in the second tuple indicate the iso versions of the derived Python `date`s. -# - If there's one other value, all the derived dates should be the same. -# - If there're two other values, then all the lower values should be the same +# - If there is one other value, all the derived dates should be the same. +# - If there are two other values, then all the lower values should be the same # and all the upper values should be the same. 
# - If there are three other values, then the upper and lower ``_strict`` values # should be the first value, and the upper and lower ``_fuzzy`` values should be @@ -81,10 +81,20 @@ ("1999-01-XX", ("1999-01-01", "1999-01-31")), # some day in 1999 ("1999-XX-XX", ("1999-01-01", "1999-12-31")), + # negative unspecified year + ("-01XX", ("-0199-01-01", "-0100-12-31")), # Uncertain/Approximate lower boundary dates (BCE) ("-0275~", ("-0275-01-01", "-0275-12-31", "-0276-01-01", "-0274-12-31")), ("-0001~", ("-0001-01-01", "-0001-12-31", "-0002-01-01", "0000-12-31")), ("0000~", ("0000-01-01", "0000-12-31", "-0001-01-01", "0001-12-31")), + # Unspecified and qualified + # "circa 17th century" + ("16XX~", ("1600-01-01", "1699-12-31", "1500-01-01", "1799-12-31")), + ("16XX%", ("1600-01-01", "1699-12-31", "1400-01-01", "1899-12-31")), + ("1XXX", ("1000-01-01", "1999-12-31")), + ("1XXX~", ("1000-01-01", "1999-12-31", "0000-01-01", "2999-12-31")), + ("156X~", ("1560-01-01", "1569-12-31", "1550-01-01", "1579-12-31")), + ("-01XX~", ("-0199-01-01", "-0100-12-31", "-0299-01-01", "0000-12-31")), # L1 Extended Interval # beginning unknown, end 2006 # for intervals with an unknown beginning or end, the unknown bound is calculated with the constant DELTA_IF_UNKNOWN (10 years) @@ -193,8 +203,22 @@ # the year -170000000 ("Y-17E7", ("-170000000-01-01", "-170000000-12-31")), # L2 significant digits + # Some year between 1900 and 1999, estimated to be 1950 + ("1950S2", ("1950-01-01", "1950-12-31", "1900-01-01", "1999-12-31")), + ("1953S2", ("1953-01-01", "1953-12-31", "1900-01-01", "1999-12-31")), + ("1953S3", ("1953-01-01", "1953-12-31", "1950-01-01", "1959-12-31")), # Some year between 171010000 and 171999999, estimated to be 171010000 ('S3' indicates a precision of 3 significant digits.) 
- # ('Y17101E4S3', ('171010000-01-01', '171999999-12-31')), + ( + "Y17101E4S3", + ("171010000-01-01", "171010000-12-31", "171000000-01-01", "171999999-12-31"), + ), + # Some year between 338000 and 338999, estimated to be 338800 + ("Y3388E2S3", ("338800-01-01", "338800-12-31", "338000-01-01", "338999-12-31")), + # some year between 171000000 and 171999999 estimated to be 171010000 + ( + "Y171010000S3", + ("171010000-01-01", "171010000-12-31", "171000000-01-01", "171999999-12-31"), + ), # L2 Seasons # Spring southern hemisphere, 2001 ("2001-29", ("2001-09-01", "2001-11-30")), @@ -202,6 +226,51 @@ ("2001-34", ("2001-04-01", "2001-06-30")), ) +BENCHMARK_EXAMPLES = ( + "2001-02-03", + "2008-12", + "2008", + "-0999", + "2004-01-01T10:10:10+05:00", + "-2005/-1999-02", + "/2006", + "?2004-%06", + "[1667, 1760-12]", + "Y3388E2S3", + "2001-29", +) + +APPROXIMATE_UNCERTAIN_EXAMPLES = ( + # first part of tuple is the input EDTF string, second part is a tuple of booleans: + # uncertain ?, approximate ~, both uncertain and approximate % + ("2004", (False, False, False)), + ("2006-06-11", (False, False, False)), + ("-0999", (False, False, False)), + ("1984?", (True, False, False)), + ("2004-06-11?", (True, False, False)), + ("1984~", (False, True, False)), + ("1984%", (False, False, True)), + ("1984~/2004-06", (False, True, False)), + ("2004-%06", (False, False, True)), + ("2004?-~06-~04", (True, True, False)), + ("2004?-06-04", (True, False, False)), + ("2011-~06-~04", (False, True, False)), + ("2004-06-~01/2004-06-~20", (False, True, False)), + ("156X~", (False, True, False)), + ("?1945/1959", (True, False, False)), + ("?1945", (True, False, False)), + ("?1945-01", (True, False, False)), + ("?1945-01-01", (True, False, False)), + ("~1945/1959", (False, True, False)), + ("~1945", (False, True, False)), + ("~1945-01", (False, True, False)), + ("~1945-01-01", (False, True, False)), + ("%1945/1959", (False, False, True)), + ("%1945", (False, False, True)), + ("%1945-01", (False, 
False, True)), + ("%1945-01-01", (False, False, True)), +) + BAD_EXAMPLES = ( # parentheses are not used for group qualification in the 2018 spec None, @@ -255,51 +324,51 @@ def test_edtf_examples(test_input, expected_tuple): # Unpack expected results based on their count if len(expected_tuple) == 1: - assert ( - result_date == expected_tuple[0] - ), f"Expected {expected_tuple[0]}, got {result_date}" + assert result_date == expected_tuple[0], ( + f"Expected {expected_tuple[0]}, got {result_date}" + ) elif len(expected_tuple) == 2: lower_strict = iso_to_struct_time(expected_tuple[0]) upper_strict = iso_to_struct_time(expected_tuple[1]) - assert ( - result.lower_strict() == lower_strict - ), f"Lower strict date does not match. Expected {lower_strict}, got {result.lower_strict()}" - assert ( - result.upper_strict() == upper_strict - ), f"Upper strict date does not match. Expected {upper_strict}, got {result.upper_strict()}" + assert result.lower_strict() == lower_strict, ( + f"Lower strict date does not match. Expected {lower_strict}, got {result.lower_strict()}" + ) + assert result.upper_strict() == upper_strict, ( + f"Upper strict date does not match. Expected {upper_strict}, got {result.upper_strict()}" + ) elif len(expected_tuple) == 3: strict_date = iso_to_struct_time(expected_tuple[0]) lower_fuzzy = iso_to_struct_time(expected_tuple[1]) upper_fuzzy = iso_to_struct_time(expected_tuple[2]) - assert ( - result.lower_strict() == strict_date - ), f"Lower strict date does not match. Expected {strict_date}, got {result.lower_strict()}" - assert ( - result.upper_strict() == strict_date - ), f"Upper strict date does not match. Expected {strict_date}, got {result.upper_strict()}" - assert ( - result.lower_fuzzy() == lower_fuzzy - ), f"Lower fuzzy date does not match. Expected {lower_fuzzy}, got {result.lower_fuzzy()}" - assert ( - result.upper_fuzzy() == upper_fuzzy - ), f"Upper fuzzy date does not match. 
Expected {upper_fuzzy}, got {result.upper_fuzzy()}" + assert result.lower_strict() == strict_date, ( + f"Lower strict date does not match. Expected {strict_date}, got {result.lower_strict()}" + ) + assert result.upper_strict() == strict_date, ( + f"Upper strict date does not match. Expected {strict_date}, got {result.upper_strict()}" + ) + assert result.lower_fuzzy() == lower_fuzzy, ( + f"Lower fuzzy date does not match. Expected {lower_fuzzy}, got {result.lower_fuzzy()}" + ) + assert result.upper_fuzzy() == upper_fuzzy, ( + f"Upper fuzzy date does not match. Expected {upper_fuzzy}, got {result.upper_fuzzy()}" + ) elif len(expected_tuple) == 4: lower_strict = iso_to_struct_time(expected_tuple[0]) upper_strict = iso_to_struct_time(expected_tuple[1]) lower_fuzzy = iso_to_struct_time(expected_tuple[2]) upper_fuzzy = iso_to_struct_time(expected_tuple[3]) - assert ( - result.lower_strict() == lower_strict - ), f"Lower strict date does not match. Expected {lower_strict}, got {result.lower_strict()}" - assert ( - result.upper_strict() == upper_strict - ), f"Upper strict date does not match. Expected {upper_strict}, got {result.upper_strict()}" - assert ( - result.lower_fuzzy() == lower_fuzzy - ), f"Lower fuzzy date does not match. Expected {lower_fuzzy}, got {result.lower_fuzzy()}" - assert ( - result.upper_fuzzy() == upper_fuzzy - ), f"Upper fuzzy date does not match. Expected {upper_fuzzy}, got {result.upper_fuzzy()}" + assert result.lower_strict() == lower_strict, ( + f"Lower strict date does not match. Expected {lower_strict}, got {result.lower_strict()}" + ) + assert result.upper_strict() == upper_strict, ( + f"Upper strict date does not match. Expected {upper_strict}, got {result.upper_strict()}" + ) + assert result.lower_fuzzy() == lower_fuzzy, ( + f"Lower fuzzy date does not match. Expected {lower_fuzzy}, got {result.lower_fuzzy()}" + ) + assert result.upper_fuzzy() == upper_fuzzy, ( + f"Upper fuzzy date does not match. 
Expected {upper_fuzzy}, got {result.upper_fuzzy()}" + ) @pytest.mark.parametrize("bad_input", BAD_EXAMPLES) @@ -309,6 +378,14 @@ def test_non_parsing(bad_input): parse(bad_input) +@pytest.mark.parametrize("bad_input", [None, ""]) +def test_empty_input(bad_input): + """Test that empty input raises a specific exception.""" + with pytest.raises(EDTFParseException) as exc_info: + parse(bad_input) + assert "You must supply some input text" in str(exc_info.value) + + def test_comparisons(): """Test comparisons between parsed EDTF objects and standard dates.""" d1 = parse("1979-08~") @@ -326,3 +403,24 @@ def test_comparisons(): assert d4 == d5 assert d1 < d5 assert d1 > d6 + + +@pytest.mark.benchmark +@pytest.mark.parametrize("test_input", BENCHMARK_EXAMPLES) +def test_benchmark_parser(benchmark, test_input): + """Benchmark parsing of selected EDTF strings.""" + benchmark(parse, test_input) + + +@pytest.mark.parametrize("test_input,expected_tuple", APPROXIMATE_UNCERTAIN_EXAMPLES) +def test_approximate_uncertain(test_input, expected_tuple): + """Test parsing of EDTF strings and check .is_uncertain, .is_approximate, + and .is_uncertain_and_approximate properties. 
The expected_tuple should have three + values, the first should be a boolean indicating if the date is uncertain, + the second should be a boolean indicating if the date is approximate, and the + third should be a boolean indicating if the date is both uncertain and approximate.""" + result = parse(test_input) + assert isinstance(result, EDTFObject), "Result should be an instance of EDTFObject" + assert result.is_uncertain == expected_tuple[0] + assert result.is_approximate == expected_tuple[1] + assert result.is_uncertain_and_approximate == expected_tuple[2] diff --git a/edtf/py.typed b/edtf/py.typed new file mode 100644 index 0000000..e69de29 diff --git a/edtf/tests.py b/edtf/tests.py index 9812b65..837e580 100644 --- a/edtf/tests.py +++ b/edtf/tests.py @@ -4,6 +4,9 @@ from time import struct_time from edtf import convert +from edtf.parser.edtf_exceptions import EDTFParseException +from edtf.parser.grammar import parse_edtf +from edtf.util import remapparams def test_dt_to_struct_time_for_datetime(): @@ -107,3 +110,75 @@ def test_roll_negative_time_fields(): assert convert._roll_negative_time_fields( year, month, day, hour, minute, second ) == (-102, 5, 24, 21, 41, 47) + + +def test_remapparams(): + @remapparams(parseAll="parse_all") + def parser(s, parse_all=True): + pass + + assert parser.__name__ == "parser" # noqa: S101 + parser("foo") + # this should not warn + parser("foo", parse_all=False) + # this should warn, but only once + for _ in 1, 2: + parser("foo", parseAll=False) + try: + parser("foo", parseAll=False, parse_all=True) + except ValueError: + pass + else: + raise AssertionError("expected ValueError because of duplicated parameters") + + try: + + @remapparams() + def no_remappings(): + pass + except ValueError: + pass + else: + raise AssertionError( + "expected ValueError from @remapparams() because no remappings" + ) + try: + + @remapparams(p1="p2", p2="p3") + def no_remappings(): + pass + except ValueError: + pass + else: + raise AssertionError( + 
"expected ValueError from @remapparams() because p1 remaps to another remapped parameter" + ) + + +def test_remapparams_parse_edtf(): + edtf_s = "2005-09-24T10:00:00" # ISO8601 example from the EDTF spec + dat = parse_edtf(edtf_s) # implicit parse_all=True + assert dat.isoformat() == edtf_s + assert parse_edtf(edtf_s, parse_all=True).isoformat() == edtf_s + assert parse_edtf(edtf_s, parseAll=True).isoformat() == edtf_s + assert parse_edtf(f"{edtf_s} SNORT", parse_all=False).isoformat() == edtf_s + assert parse_edtf(f"{edtf_s} SNORT", parseAll=False).isoformat() == edtf_s + # make sure parse_all=True fails the SNORT parse + try: + parse_edtf(f"{edtf_s} SNORT") + except EDTFParseException: + pass + else: + raise AssertionError("expected EDTFParseException") + try: + parse_edtf(f"{edtf_s} SNORT", parse_all=True) + except EDTFParseException: + pass + else: + raise AssertionError("expected EDTFParseException") + try: + parse_edtf(f"{edtf_s} SNORT", parseAll=True) + except EDTFParseException: + pass + else: + raise AssertionError("expected EDTFParseException") diff --git a/edtf/util.py b/edtf/util.py new file mode 100644 index 0000000..146eec2 --- /dev/null +++ b/edtf/util.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 + +""" +Assorted utility functions. +""" + +from functools import update_wrapper +from logging import warning +from traceback import extract_stack + + +def remapparams(**remap): + """ + Remap the specified named parameters. 
+ + Example to support an obsolete `parseAll` parameter: + + @remapparams(parseAll='parse_all') + def parse(s, parse_all=True): + + """ + if not remap: + raise ValueError("no parameters specified for remapping") + for old, new in remap.items(): + if new in remap: + raise ValueError(f"{old}={new!r}: {new!r} is also remapped") + + def remapparams_decorator(func): + """The decorator to apply the remappings.""" + # a record of callers whose parameters were remapped + remapped_callers = set() + + def remapparams_wrapper(*a, **kw): + remappings = {} + for param, value in list(kw.items()): + try: + remapped = remap[param] + except KeyError: + continue + if remapped in kw: + raise ValueError( + f"remap {param}= to {remapped}=: this is already present in the keyword arguments" + ) + del kw[param] + kw[remapped] = value + remappings[param] = remapped + if remappings: + caller_frame = extract_stack(limit=2)[-2] + caller_key = caller_frame.filename, caller_frame.lineno + if caller_key not in remapped_callers: + warning( + "call of %s.%s() from %s:%d: remapped the following obsolete parameters: %s", + func.__module__, + func.__name__, + caller_frame.filename, + caller_frame.lineno, + ", ".join( + sorted(f"{old}->{new}" for old, new in remappings.items()) + ), + ) + remapped_callers.add(caller_key) + return func(*a, **kw) + + update_wrapper(remapparams_wrapper, func) + return remapparams_wrapper + + return remapparams_decorator diff --git a/edtf_django_tests/edtf_integration/admin.py b/edtf_django_tests/edtf_integration/admin.py index 846f6b4..3051891 100644 --- a/edtf_django_tests/edtf_integration/admin.py +++ b/edtf_django_tests/edtf_integration/admin.py @@ -1 +1,43 @@ -# Register your models here. 
+from django.contrib import admin + +from .models import TestEvent + + +class TestEventAdmin(admin.ModelAdmin): + list_display = ( + "date_display", + "date_edtf_direct", + "date_earliest", + "date_latest", + "date_sort_ascending", + "date_sort_descending", + "date_edtf", + ) + search_fields = ("date_display", "date_edtf_direct") + list_filter = ("date_earliest", "date_latest") + readonly_fields = ( + "date_earliest", + "date_latest", + "date_sort_ascending", + "date_sort_descending", + "date_edtf", + ) + + fieldsets = ( + (None, {"fields": ("date_display", "date_edtf_direct", "date_edtf")}), + ( + "Computed Dates", + { + "classes": ("collapse",), + "fields": ( + "date_earliest", + "date_latest", + "date_sort_ascending", + "date_sort_descending", + ), + }, + ), + ) + + +admin.site.register(TestEvent, TestEventAdmin) diff --git a/edtf_django_tests/edtf_integration/models.py b/edtf_django_tests/edtf_integration/models.py index 5120889..5e66592 100644 --- a/edtf_django_tests/edtf_integration/models.py +++ b/edtf_django_tests/edtf_integration/models.py @@ -49,9 +49,5 @@ def __str__(self) -> str: return ( f"Test Event: {self.date_display=}, " f"{self.date_edtf_direct=}, " - f"{self.date_earliest=}, " - f"{self.date_latest=}, " - f"{self.date_sort_ascending=}, " - f"{self.date_sort_descending=}, " f"{self.date_edtf=}" ) diff --git a/edtf_django_tests/edtf_integration/tests.py b/edtf_django_tests/edtf_integration/tests.py index 88fdca8..aa1bf34 100644 --- a/edtf_django_tests/edtf_integration/tests.py +++ b/edtf_django_tests/edtf_integration/tests.py @@ -74,6 +74,26 @@ def test_date_display(self): self.assertEqual(self.event3.date_display, "2019-11") self.assertEqual(self.event4.date_display, "Approximately August 2018") + def test_date_display_with_none_or_empty_string(self): + """ + Test that the date_display field is correctly populated when the + `natural_date` field is set to empty string (for example, if it + were used with `null=False` in the model definition) or 
set to + None (if it were used with `null=True`). + """ + event = TestEvent(date_display="") + event.date_edtf_direct = "2020-03-15/2020-04-15" + # Trigger the descriptor to update the date_display field + event.date_edtf = "" + self.assertEqual(event.date_display, "2020-03-15/2020-04-15") + + event = TestEvent(date_display=None) + # Verify date_display is set to None even though the field is `null=False` + self.assertIsNone(event.date_display) + event.date_edtf_direct = "2020-03-15/2020-04-15" + event.date_edtf = "" + self.assertEqual(event.date_display, "2020-03-15/2020-04-15") + def test_comparison(self): # test equality of the same dates self.assertEqual( @@ -102,3 +122,34 @@ def test_comparison(self): self.event2.date_edtf, "2019-11 is less than 2021-05-06", ) + + def test_field_related_field_specification(self): + edtf_field_on_model = TestEvent._meta.get_field("date_edtf") + required_fields = ( + "direct_input_field", + "lower_fuzzy_field", + "lower_strict_field", + "natural_text_field", + "upper_fuzzy_field", + "upper_strict_field", + ) + for field_alias in required_fields: + # Remove the alias from the edtf_field + orig_value = getattr(edtf_field_on_model, field_alias) + setattr(edtf_field_on_model, field_alias, None) + errors = edtf_field_on_model.check() + self.assertEqual(len(errors), 1) + self.assertTrue(field_alias in errors[0].msg) + # Should be an 'alias not specified' error + self.assertEqual(errors[0].id, "python-edtf.EDTF01") + + # Point the alias to a non-existent field + setattr(edtf_field_on_model, field_alias, "fake") + errors = edtf_field_on_model.check() + self.assertEqual(len(errors), 1) + self.assertTrue(field_alias in errors[0].msg) + # Should be a 'non-eixstent field' error + self.assertEqual(errors[0].id, "python-edtf.EDTF02") + + # Repair the field so later tests can still work + setattr(edtf_field_on_model, field_alias, orig_value) diff --git a/pyproject.toml b/pyproject.toml index 8dea9fd..5915bde 100644 --- a/pyproject.toml +++ 
b/pyproject.toml @@ -1,21 +1,25 @@ [project] name = "edtf" version = "5.0.0" +license = { file = "LICENSE" } +keywords = ['edtf'] dependencies = [ "python-dateutil", - "pyparsing", - "six" + "pyparsing>=3.0.0", ] description = "Python implementation of Library of Congress EDTF (Extended Date Time Format) specification" -requires-python = ">=3.8" -readme = {file = "README.txt", content-type = "text/markdown"} +requires-python = ">=3.10" +readme = {file = "README.md", content-type = "text/markdown"} authors = [ { name = "The Interaction Consortium", email = "studio@interaction.net.au"}, { name = "Alastair Weakley"}, + { name = "Greg Turner"}, { name = "James Murty"}, { name = "Mark Finger" }, { name = "Sabine Müller" }, - { name = "Cole Crawford" } + { name = "Cole Crawford" }, + { name = "Klaus Rettinghaus" }, + { name = "Andrew Hankinson", email = "andrew.hankinson@rism.digital" }, ] maintainers = [ { name = "The Interaction Consortium", email = "studio@interaction.net.au" } @@ -32,16 +36,23 @@ classifiers = [ "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", ] [project.optional-dependencies] test = [ "django>=4.2,<5.0", "pytest", + "pytest-django", + "pytest-benchmark", "ruff", "pre-commit", "coverage", - "pytest-cov" + "pytest-cov", + "junitparser", + "mypy>=1.15.0", + "pip>=25.1.1", ] [project.urls] @@ -79,8 +90,11 @@ legacy_tox_ini = """ python_files = ["tests.py", "test_*.py", "*_test.py", "*_tests.py"] python_classes = ["Test*", "*Tests"] python_functions = ["test_*"] -addopts = "--ignore=edtf_django_tests/ --cov=edtf --cov-report=xml" -plugins = ["pytest_cov"] +markers = [ + "benchmark: mark a test as a benchmark", +] +addopts = "--ignore=edtf_django_tests/ --cov=edtf -m 'not benchmark'" +plugins = ["pytest_cov", "pytest_benchmark"] [tool.coverage.run] # we run the edtf_integration tests but only 
care about them testing fields.py in the main package @@ -101,7 +115,7 @@ exclude_lines = [ [tool.ruff] # Python 3.8 -target-version = "py38" +target-version = "py311" extend-exclude = [ '**/migrations/*', @@ -137,4 +151,6 @@ ignore = [ "E501", # Ignore McCabe complexity (for now). "C901", + # Ignore percent format -> format specifier rule for now (pending merge of #73 which resolves them) + "UP031", ] diff --git a/requirements.txt b/requirements.txt index 0ab3a7d..f142bc2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,2 @@ python-dateutil -pyparsing -six +pyparsing >= 3.0.0