Implement garbage collection for inactive cache files (#121)

Cache files that haven't been accessed in the last 30 days are automatically garbage collected. The collection is triggered when `save_module` is called, and a lock file ensures that it runs at most once per day.
2025-12-07 21:34:32 +08:00 · 2020-06-02 12:36:05 +03:00
parent 450e9d0a19
commit fe24f0dc1b
4 changed files with 150 additions and 18 deletions
--- a/parso/_compatibility.py
+++ b/parso/_compatibility.py
@@ -2,6 +2,7 @@
 To ensure compatibility from Python ``2.7`` - ``3.3``, a module has been
 created. Clearly there is huge need to use conforming syntax.
 """
+import os
 import sys
 import platform

@@ -44,11 +45,11 @@ def u(string):


 try:
-    # Python 2.7
+    # Python 3.3+
    FileNotFoundError = FileNotFoundError
 except NameError:
-    # Python 3.3+
-    FileNotFoundError = IOError
+    # Python 2.7 (both IOError + OSError)
+    FileNotFoundError = EnvironmentError


 def utf8_repr(func):
@@ -67,3 +68,27 @@ def utf8_repr(func):
        return func
    else:
        return wrapper
+
+if sys.version_info < (3, 5):
+    """
+    A super-minimal shim around listdir that behaves like
+    scandir for the information we need.
+    """
+    class _DirEntry:
+
+        def __init__(self, name, basepath):
+            self.name = name
+            self.basepath = basepath
+
+        @property
+        def path(self):
+            return os.path.join(self.basepath, self.name)
+
+        def stat(self):
+            # won't follow symlinks
+            return os.lstat(os.path.join(self.basepath, self.name))
+
+    def scandir(dir):
+        return [_DirEntry(name, dir) for name in os.listdir(dir)]
+else:
+    from os import scandir
--- a/parso/cache.py
+++ b/parso/cache.py
@@ -13,7 +13,8 @@ try:
 except:
    import pickle

-from parso._compatibility import FileNotFoundError
+from parso._compatibility import FileNotFoundError, scandir
+from parso.file_io import FileIO

 LOG = logging.getLogger(__name__)

@@ -21,6 +22,13 @@ _CACHED_FILE_MINIMUM_SURVIVAL = 60 * 10  # 10 minutes
 """
 Cached files should survive at least a few minutes.
 """
+
+_CACHED_FILE_MAXIMUM_SURVIVAL = 60 * 60 * 24 * 30
+"""
+Maximum time (in seconds) a cached file survives without being
+accessed; the default is 30 days.
+"""
+
 _CACHED_SIZE_TRIGGER = 600
 """
 This setting limits the amount of cached files. It's basically a way to start
@@ -82,6 +90,19 @@ On Linux, if environment variable ``$XDG_CACHE_HOME`` is set,
 ``$XDG_CACHE_HOME/parso`` is used instead of the default one.
 """

+_CACHE_CLEAR_THRESHOLD = 60 * 60 * 24
+
+def _get_cache_clear_lock(cache_path = None):
+    """
+    The path where the cache lock is stored.
+
+    Cache lock will prevent continuous cache clearing and only allow garbage
+    collection once a day (can be configured in _CACHE_CLEAR_THRESHOLD).
+    """
+    cache_path = cache_path or _get_default_cache_path()
+    return FileIO(os.path.join(cache_path, "PARSO-CACHE-LOCK"))
+
+
 parser_cache = {}


@@ -173,6 +194,7 @@ def save_module(hashed_grammar, file_io, module, lines, pickling=True, cache_pat
    _set_cache_item(hashed_grammar, path, item)
    if pickling and path is not None:
        _save_to_file_system(hashed_grammar, path, item, cache_path=cache_path)
+        _remove_cache_and_update_lock(cache_path = cache_path)


 def _save_to_file_system(hashed_grammar, path, item, cache_path=None):
@@ -187,6 +209,46 @@ def clear_cache(cache_path=None):
    parser_cache.clear()


+def clear_inactive_cache(
+    cache_path=None,
+    inactivity_threshold=_CACHED_FILE_MAXIMUM_SURVIVAL,
+):
+    if cache_path is None:
+        cache_path = _get_default_cache_path()
+    if not os.path.exists(cache_path):
+        return False
+    for version_path in os.listdir(cache_path):
+        version_path = os.path.join(cache_path, version_path)
+        if not os.path.isdir(version_path):
+            continue
+        for file in scandir(version_path):
+            if (
+                file.stat().st_atime + _CACHED_FILE_MAXIMUM_SURVIVAL
+                <= time.time()
+            ):
+                try:
+                    os.remove(file.path)
+                except OSError: # silently ignore all failures
+                    continue
+    else:
+        return True
+
+
+def _remove_cache_and_update_lock(cache_path = None):
+    lock = _get_cache_clear_lock(cache_path=cache_path)
+    clear_lock_time = lock.get_last_modified()
+    if (
+        clear_lock_time is None # first time
+        or clear_lock_time + _CACHE_CLEAR_THRESHOLD <= time.time()
+    ):
+        if not lock._touch():
+            # First make sure that as few as possible other cleanup jobs also
+            # get started. There is still a race condition but it's probably
+            # not a big problem.
+            return False
+
+        clear_inactive_cache(cache_path = cache_path)
+
 def _get_hashed_path(hashed_grammar, path, cache_path=None):
    directory = _get_cache_directory_path(cache_path=cache_path)

--- a/parso/file_io.py
+++ b/parso/file_io.py
@@ -1,4 +1,5 @@
 import os
+from parso._compatibility import FileNotFoundError


 class FileIO(object):
@@ -22,6 +23,17 @@ class FileIO(object):
            # Might raise FileNotFoundError, OSError for Python 2
            return None

+    def _touch(self):
+        try:
+            os.utime(self.path, None)
+        except FileNotFoundError:
+            try:
+                file = open(self.path, 'a')
+                file.close()
+            except (OSError, IOError):  # TODO Maybe log this?
+                return False
+        return True
+
    def __repr__(self):
        return '%s(%s)' % (self.__class__.__name__, self.path)

--- a/test/test_cache.py
+++ b/test/test_cache.py
@@ -2,13 +2,19 @@
 Test all things related to the ``jedi.cache`` module.
 """

-from os import unlink
+import os
+import os.path

 import pytest
 import time

-from parso.cache import _NodeCacheItem, save_module, load_module, \
-    _get_hashed_path, parser_cache, _load_from_file_system, _save_to_file_system
+from parso.cache import (_CACHED_FILE_MAXIMUM_SURVIVAL, _VERSION_TAG,
+                         _get_cache_clear_lock, _get_hashed_path,
+                         _load_from_file_system, _NodeCacheItem,
+                         _remove_cache_and_update_lock, _save_to_file_system,
+                         clear_inactive_cache, load_module, parser_cache,
+                         save_module)
+from parso._compatibility import is_pypy
 from parso import load_grammar
 from parso import cache
 from parso import file_io
@@ -16,15 +22,13 @@ from parso import parse


@pytest.fixture()
-def isolated_jedi_cache(monkeypatch, tmpdir):
-    """
-    Set `jedi.settings.cache_directory` to a temporary directory during test.
-
-    Same as `clean_jedi_cache`, but create the temporary directory for
-    each test case (scope='function').
-    """
-    monkeypatch.setattr(cache, '_default_cache_path', str(tmpdir))
-
+def isolated_parso_cache(monkeypatch, tmpdir):
+    """Set `parso.cache._default_cache_path` to a temporary directory
+    during the test. """
+    cache_path = str(os.path.join(str(tmpdir), "__parso_cache"))
+    monkeypatch.setattr(cache, '_default_cache_path', cache_path)
+    monkeypatch.setattr(cache, '_get_default_cache_path', lambda *args, **kwargs: cache_path)
+    return cache_path

 def test_modulepickling_change_cache_dir(tmpdir):
    """
@@ -57,7 +61,7 @@ def load_stored_item(hashed_grammar, path, item, cache_path):
    return item


-@pytest.mark.usefixtures("isolated_jedi_cache")
+@pytest.mark.usefixtures("isolated_parso_cache")
 def test_modulepickling_simulate_deleted_cache(tmpdir):
    """
    Tests loading from a cache file after it is deleted.
@@ -84,7 +88,7 @@ def test_modulepickling_simulate_deleted_cache(tmpdir):
    save_module(grammar._hashed, io, module, lines=[])
    assert load_module(grammar._hashed, io) == module

-    unlink(_get_hashed_path(grammar._hashed, path))
+    os.unlink(_get_hashed_path(grammar._hashed, path))
    parser_cache.clear()

    cached2 = load_module(grammar._hashed, io)
@@ -139,3 +143,32 @@ def test_cache_last_used_update(diff_cache, use_file_io):

    node_cache_item = next(iter(parser_cache.values()))[p]
    assert now < node_cache_item.last_used < time.time()
+
+@pytest.mark.skipif(
+    is_pypy, 
+    reason="pickling in pypy is slow, since we don't pickle,"
+           "we never go into path of auto-collecting garbage"
+)
+def test_inactive_cache(tmpdir, isolated_parso_cache):
+    parser_cache.clear()
+    test_subjects = "abcdef"
+    for path in test_subjects:
+        parse('somecode', cache=True, path=os.path.join(str(tmpdir), path))
+    raw_cache_path = os.path.join(isolated_parso_cache, _VERSION_TAG)
+    assert os.path.exists(raw_cache_path)
+    paths = os.listdir(raw_cache_path)
+    a_while_ago = time.time() - _CACHED_FILE_MAXIMUM_SURVIVAL
+    old_paths = set()
+    for path in paths[:len(test_subjects) // 2]: # make certain number of paths old
+        os.utime(os.path.join(raw_cache_path, path), (a_while_ago, a_while_ago))
+        old_paths.add(path)
+    # nothing should be cleared while the lock is on
+    assert os.path.exists(_get_cache_clear_lock().path)
+    _remove_cache_and_update_lock() # it shouldn't clear anything
+    assert len(os.listdir(raw_cache_path)) == len(test_subjects)
+    assert old_paths.issubset(os.listdir(raw_cache_path))
+
+    os.utime(_get_cache_clear_lock().path, (a_while_ago, a_while_ago))
+    _remove_cache_and_update_lock()
+    assert len(os.listdir(raw_cache_path)) == len(test_subjects) // 2
+    assert not old_paths.intersection(os.listdir(raw_cache_path))