diff --git a/TODO.md b/TODO.md new file mode 100644 index 000000000..1af26377c --- /dev/null +++ b/TODO.md @@ -0,0 +1,29 @@ +tables +====== + +```sql +CREATE TABLE IF NOT EXISTS manifests ( + repo TEXT NOT NULL, + rev TEXT NOT NULL, + manifest TEXT NOT NULL, + PRIMARY KEY (repo, rev) +); + +-- `clones` -- ephemeral but helpful for pre-commit.ci to pre-seed ? +CREATE TABLE IF NOT EXISTS clones ( + repo TEXT NOT NULL, + rev TEXT NOT NULL, + path TEXT NOT NULL, + PRIMARY KEY (repo, rev) +); + +CREATE TABLE IF NOT EXISTS installs ( + repo TEXT NOT NULL, + rev TEXT NOT NULL, + language TEXT NOT NULL, + language_version TEXT NOT NULL, + additional_dependencies TEXT NOT NULL, + path TEXT NOT NULL, + PRIMARY KEY (repo, rev, language, language_version, additional_dependencies) +); +``` diff --git a/pre_commit/commands/gc.py b/pre_commit/commands/gc.py index d1941e4bb..74a293cdd 100644 --- a/pre_commit/commands/gc.py +++ b/pre_commit/commands/gc.py @@ -5,5 +5,7 @@ def gc(store: Store) -> int: - output.write_line(f'{store.gc()} repo(s) removed.') + installs, clones = store.gc() + output.write_line(f'{clones} clone(s) removed.') + output.write_line(f'{installs} installs(s) removed.') return 0 diff --git a/pre_commit/store.py b/pre_commit/store.py index 34c5f0d92..a42903878 100644 --- a/pre_commit/store.py +++ b/pre_commit/store.py @@ -1,13 +1,13 @@ from __future__ import annotations import contextlib +import json import logging import os.path import sqlite3 import tempfile from collections.abc import Callable from collections.abc import Generator -from collections.abc import Sequence from typing import Any import pre_commit.constants as C @@ -63,7 +63,7 @@ class Store: def __init__(self, directory: str | None = None) -> None: self.directory = directory or Store.get_default_directory() - self.db_path = os.path.join(self.directory, 'db.db') + self.db_path = os.path.join(self.directory, 'db5.db') self.readonly = ( os.path.exists(self.directory) and not os.access(self.directory, os.W_OK) @@ -84,20 +84,44 @@ def __init__(self, directory: str | None = None) -> None: if os.path.exists(self.db_path): # pragma: no cover (race) return # To avoid a race where someone ^Cs between db creation and - # execution of the CREATE TABLE statement + # execution of the CREATE TABLE statements fd, tmpfile = tempfile.mkstemp(dir=self.directory) # We'll be managing this file ourselves os.close(fd) with self.connect(db_path=tmpfile) as db: db.executescript( - 'CREATE TABLE repos (' + 'CREATE TABLE configs (' + ' path TEXT NOT NULL,' + ' PRIMARY KEY (path)' + ');', + ) + db.executescript( + 'CREATE TABLE manifests (' + ' repo TEXT NOT NULL,' + ' rev TEXT NOT NULL,' + ' manifest TEXT NOT NULL,' + ' PRIMARY KEY (repo, rev)' + ');', + ) + db.executescript( + 'CREATE TABLE clones (' + ' repo TEXT NOT NULL,' + ' rev TEXT NOT NULL,' + ' path TEXT NOT NULL,' + ' PRIMARY KEY (repo, rev)' + ');', + ) + db.executescript( + 'CREATE TABLE installs (' ' repo TEXT NOT NULL,' - ' ref TEXT NOT NULL,' + ' rev TEXT NOT NULL,' + ' language TEXT NOT NULL,' + ' language_version TEXT NOT NULL,' + ' additional_dependencies TEXT NOT NULL,' ' path TEXT NOT NULL,' - ' PRIMARY KEY (repo, ref)' + ' PRIMARY KEY (repo, rev, language, language_version, additional_dependencies)' # noqa: E501 ');', ) - self._create_configs_table(db) # Atomic file move os.replace(tmpfile, self.db_path) @@ -124,29 +148,18 @@ def connect( with db: yield db - @classmethod - def db_repo_name(cls, repo: str, deps: Sequence[str]) -> str: - if deps: - return f'{repo}:{",".join(deps)}' - else: - return repo - def _new_repo( self, repo: str, - ref: str, - deps: Sequence[str], + rev: str, make_strategy: Callable[[str], None], ) -> str: - original_repo = repo - repo = self.db_repo_name(repo, deps) - def _get_result() -> str | None: # Check if we already exist with self.connect() as db: result = db.execute( - 'SELECT path FROM repos WHERE repo = ? AND ref = ?', - (repo, ref), + 'SELECT path FROM clones WHERE repo = ? AND rev = ?', + (repo, rev), ).fetchone() return result[0] if result else None @@ -159,43 +172,52 @@ def _get_result() -> str | None: if result: # pragma: no cover (race) return result - logger.info(f'Initializing environment for {repo}.') + logger.info(f'Cloning {repo}...') - directory = tempfile.mkdtemp(prefix='repo', dir=self.directory) + directory = tempfile.mkdtemp(prefix='clone', dir=self.directory) with clean_path_on_failure(directory): make_strategy(directory) + manifest = clientlib.load_manifest( + os.path.join(directory, C.MANIFEST_FILE), + display_filename=f'({repo})/{C.MANIFEST_FILE}', + ) + by_id = {hook['id']: hook for hook in manifest} # Update our db with the created repo with self.connect() as db: db.execute( - 'INSERT INTO repos (repo, ref, path) VALUES (?, ?, ?)', - [repo, ref, directory], + 'INSERT INTO clones VALUES (?, ?, ?)', + (repo, rev, directory), + ) + db.execute( + 'INSERT INTO manifests VALUES (?, ?, ?)', + (repo, rev, json.dumps(by_id)), ) - clientlib.warn_for_stages_on_repo_init(original_repo, directory) + clientlib.warn_for_stages_on_repo_init(repo, directory) return directory - def _complete_clone(self, ref: str, git_cmd: Callable[..., None]) -> None: + def _complete_clone(self, rev: str, git_cmd: Callable[..., None]) -> None: """Perform a complete clone of a repository and its submodules """ git_cmd('fetch', 'origin', '--tags') - git_cmd('checkout', ref) + git_cmd('checkout', rev) git_cmd('submodule', 'update', '--init', '--recursive') - def _shallow_clone(self, ref: str, git_cmd: Callable[..., None]) -> None: + def _shallow_clone(self, rev: str, git_cmd: Callable[..., None]) -> None: """Perform a shallow clone of a repository and its submodules """ - git_config = 'protocol.version=2' - git_cmd('-c', git_config, 'fetch', 'origin', ref, '--depth=1') + v2 = ('-c', 'protocol.version=2') + git_cmd(*v2, 'fetch', 'origin', rev, '--depth=1') git_cmd('checkout', 'FETCH_HEAD') git_cmd( - '-c', git_config, 'submodule', 'update', '--init', '--recursive', + *v2, 'submodule', 'update', '--init', '--recursive', '--depth=1', ) - def clone(self, repo: str, ref: str, deps: Sequence[str] = ()) -> str: - """Clone the given url and checkout the specific ref.""" + def clone(self, repo: str, rev: str) -> str: + """Clone the given url and checkout the specific rev.""" def clone_strategy(directory: str) -> None: git.init_repo(directory, repo) @@ -205,24 +227,14 @@ def _git_cmd(*args: str) -> None: cmd_output_b('git', *args, cwd=directory, env=env) try: - self._shallow_clone(ref, _git_cmd) + self._shallow_clone(rev, _git_cmd) except CalledProcessError: - self._complete_clone(ref, _git_cmd) - - return self._new_repo(repo, ref, deps, clone_strategy) + self._complete_clone(rev, _git_cmd) - def make_local(self, deps: Sequence[str]) -> str: - return self._new_repo( - 'local', C.LOCAL_REPO_VERSION, deps, _make_local_repo, - ) + return self._new_repo(repo, rev, clone_strategy) - def _create_configs_table(self, db: sqlite3.Connection) -> None: - db.executescript( - 'CREATE TABLE IF NOT EXISTS configs (' - ' path TEXT NOT NULL,' - ' PRIMARY KEY (path)' - ');', - ) + def make_local(self) -> str: + return self._new_repo('local', C.LOCAL_REPO_VERSION, _make_local_repo) def mark_config_used(self, path: str) -> None: if self.readonly: # pragma: win32 no cover @@ -232,13 +244,11 @@ def mark_config_used(self, path: str) -> None: if not os.path.exists(path): return with self.connect() as db: - # TODO: eventually remove this and only create in _create - self._create_configs_table(db) db.execute('INSERT OR IGNORE INTO configs VALUES (?)', (path,)) - def _mark_used_repos( + def _mark_used_installs( self, - all_repos: dict[tuple[str, str], str], + manifests: dict[tuple[str, str], dict[str, dict[str, Any]]], unused_repos: set[tuple[str, str]], repo: dict[str, Any], ) -> None: @@ -282,11 +292,25 @@ def _mark_used_repos( def gc(self) -> int: with self.exclusive_lock(), self.connect() as db: - self._create_configs_table(db) - - repos = db.execute('SELECT repo, ref, path FROM repos').fetchall() - all_repos = {(repo, ref): path for repo, ref, path in repos} - unused_repos = set(all_repos) + all_installs = { + ( + repo, rev, + language, language_version, + tuple(json.loads(deps)), + ): path + for repo, rev, language, language_version, deps, path in db.execute( + 'SELECT repo, rev, language, language_version, deps, path\n' + 'FROM repos' + ).fetchall() + } + unused_installs = set(all_installs) + + manifests = { + (repo, rev): json.loads(manifest) + for repo, rev, manifest in db.execute( + 'SELECT repo, rev, manifest FROM manifests' + ).fetchall() + } configs_rows = db.execute('SELECT path FROM configs').fetchall() configs = [path for path, in configs_rows] @@ -300,7 +324,7 @@ def gc(self) -> int: continue else: for repo in config['repos']: - self._mark_used_repos(all_repos, unused_repos, repo) + self._mark_used_installs(manifests, unused_repos, repo) paths = [(path,) for path in dead_configs] db.executemany('DELETE FROM configs WHERE path = ?', paths) @@ -312,4 +336,9 @@ def gc(self) -> int: for k in unused_repos: rmtree(all_repos[k]) - return len(unused_repos) + res = db.execute('SELECT path FROM clones').fetchall() + clones = [path for path, in res] + db.execute('DELETE FROM clones') + for path in clones: + rmtree(path) + return len(paths), len(unused_repos)