Implement garbage collection for inactive cache files (#121)

Cache files that haven't been accessed in the last 30 days are now garbage
collected automatically. The collection runs when `save_module` is called,
and a lock file throttles it so that it happens at most once per day.
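
The mechanism can be summarized with a minimal, self-contained Python 3
sketch of the lock-file throttle (the path and function names here are
hypothetical illustrations, not parso's actual API):

    import os
    import time

    LOCK_PATH = "/tmp/example-cache/CACHE-LOCK"  # hypothetical lock location
    THRESHOLD = 60 * 60 * 24  # one day, matching this commit's default


    def maybe_collect(collect):
        """Run `collect` at most once per THRESHOLD, even across processes."""
        try:
            last_run = os.path.getmtime(LOCK_PATH)
        except OSError:
            last_run = None  # first run: the lock file does not exist yet
        if last_run is None or last_run + THRESHOLD <= time.time():
            # Touch the lock *before* sweeping so that concurrent processes
            # see a fresh lock and skip their own sweep.
            os.makedirs(os.path.dirname(LOCK_PATH), exist_ok=True)
            with open(LOCK_PATH, "a"):
                pass
            os.utime(LOCK_PATH, None)
            collect()

Calling maybe_collect() repeatedly would run the sweep once and then skip it
for a day; the diff below implements this pattern with FileIO._touch().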
Batuhan Taskaya
2020-06-02 12:36:05 +03:00
committed by GitHub
parent 450e9d0a19
commit fe24f0dc1b
4 changed files with 150 additions and 18 deletions

--- a/parso/_compatibility.py
+++ b/parso/_compatibility.py

@@ -2,6 +2,7 @@
 To ensure compatibility from Python ``2.7`` - ``3.3``, a module has been
 created. Clearly there is huge need to use conforming syntax.
 """
+import os
 import sys
 import platform
@@ -44,11 +45,11 @@ def u(string):
 try:
-    # Python 2.7
+    # Python 3.3+
     FileNotFoundError = FileNotFoundError
 except NameError:
-    # Python 3.3+
-    FileNotFoundError = IOError
+    # Python 2.7 (both IOError + OSError)
+    FileNotFoundError = EnvironmentError


 def utf8_repr(func):
@@ -67,3 +68,27 @@ def utf8_repr(func):
         return func
     else:
         return wrapper
+
+
+if sys.version_info < (3, 5):
+    """
+    A super-minimal shim around listdir that behaves like
+    scandir for the information we need.
+    """
+    class _DirEntry:
+
+        def __init__(self, name, basepath):
+            self.name = name
+            self.basepath = basepath
+
+        @property
+        def path(self):
+            return os.path.join(self.basepath, self.name)
+
+        def stat(self):
+            # won't follow symlinks
+            return os.lstat(os.path.join(self.basepath, self.name))
+
+    def scandir(dir):
+        return [_DirEntry(name, dir) for name in os.listdir(dir)]
+else:
+    from os import scandir
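
A quick sanity check of the shim (a hypothetical snippet, assuming parso is
importable): the garbage collector below relies only on the .name, .path and
.stat() members, which is exactly what the shim provides.

    import os
    import tempfile

    from parso._compatibility import scandir  # the shim, or os.scandir on 3.5+

    tmp = tempfile.mkdtemp()
    open(os.path.join(tmp, 'entry.pkl'), 'w').close()

    for entry in scandir(tmp):
        # Only these three members are used by the cache garbage collector.
        print(entry.name, entry.path, entry.stat().st_atime)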

--- a/parso/cache.py
+++ b/parso/cache.py

@@ -13,7 +13,8 @@ try:
 except:
     import pickle

-from parso._compatibility import FileNotFoundError
+from parso._compatibility import FileNotFoundError, scandir
+from parso.file_io import FileIO

 LOG = logging.getLogger(__name__)
@@ -21,6 +22,13 @@ _CACHED_FILE_MINIMUM_SURVIVAL = 60 * 10 # 10 minutes
""" """
Cached files should survive at least a few minutes. Cached files should survive at least a few minutes.
""" """
_CACHED_FILE_MAXIMUM_SURVIVAL = 60 * 60 * 24 * 30
"""
Maximum time for a cached file to survive if it is not
accessed within.
"""
_CACHED_SIZE_TRIGGER = 600 _CACHED_SIZE_TRIGGER = 600
""" """
This setting limits the amount of cached files. It's basically a way to start This setting limits the amount of cached files. It's basically a way to start
@@ -82,6 +90,19 @@ On Linux, if environment variable ``$XDG_CACHE_HOME`` is set,
 ``$XDG_CACHE_HOME/parso`` is used instead of the default one.
 """

+_CACHE_CLEAR_THRESHOLD = 60 * 60 * 24
+
+
+def _get_cache_clear_lock(cache_path=None):
+    """
+    The lock file used to throttle cache clearing.
+
+    It prevents continuous cache clearing and allows garbage collection
+    only once per day (configurable via _CACHE_CLEAR_THRESHOLD).
+    """
+    cache_path = cache_path or _get_default_cache_path()
+    return FileIO(os.path.join(cache_path, "PARSO-CACHE-LOCK"))

 parser_cache = {}
@@ -173,6 +194,7 @@ def save_module(hashed_grammar, file_io, module, lines, pickling=True, cache_pat
     _set_cache_item(hashed_grammar, path, item)
     if pickling and path is not None:
         _save_to_file_system(hashed_grammar, path, item, cache_path=cache_path)
+        _remove_cache_and_update_lock(cache_path=cache_path)


 def _save_to_file_system(hashed_grammar, path, item, cache_path=None):
@@ -187,6 +209,46 @@ def clear_cache(cache_path=None):
     parser_cache.clear()


+def clear_inactive_cache(
+        cache_path=None,
+        inactivity_threshold=_CACHED_FILE_MAXIMUM_SURVIVAL,
+):
+    if cache_path is None:
+        cache_path = _get_default_cache_path()
+    if not os.path.exists(cache_path):
+        return False
+    for version_path in os.listdir(cache_path):
+        version_path = os.path.join(cache_path, version_path)
+        if not os.path.isdir(version_path):
+            continue
+        for file in scandir(version_path):
+            # Remove anything whose last access time predates the threshold.
+            if file.stat().st_atime + inactivity_threshold <= time.time():
+                try:
+                    os.remove(file.path)
+                except OSError:  # silently ignore all failures
+                    continue
+    return True
+
+
+def _remove_cache_and_update_lock(cache_path=None):
+    lock = _get_cache_clear_lock(cache_path=cache_path)
+    clear_lock_time = lock.get_last_modified()
+    if (
+        clear_lock_time is None  # first time
+        or clear_lock_time + _CACHE_CLEAR_THRESHOLD <= time.time()
+    ):
+        if not lock._touch():
+            # Touch the lock before clearing so that as few other cleanup
+            # jobs as possible get started. There is still a race condition,
+            # but it's probably not a big problem.
+            return False
+        clear_inactive_cache(cache_path=cache_path)


 def _get_hashed_path(hashed_grammar, path, cache_path=None):
     directory = _get_cache_directory_path(cache_path=cache_path)
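
Taken together: every cached parse ends in save_module(), which pickles the
tree and then calls _remove_cache_and_update_lock(); the lock file's mtime
throttles clear_inactive_cache() to one sweep per _CACHE_CLEAR_THRESHOLD. A
short usage sketch (the module path is hypothetical; clear_inactive_cache is
a private API and may change):

    import parso
    from parso.cache import clear_inactive_cache

    # Ordinary cached parsing is enough to trigger at most one
    # garbage-collection sweep per day:
    parso.parse('x = 1', cache=True, path='/tmp/some_module.py')

    # The sweep can also be forced directly, bypassing the lock:
    clear_inactive_cache()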

--- a/parso/file_io.py
+++ b/parso/file_io.py

@@ -1,4 +1,5 @@
 import os

+from parso._compatibility import FileNotFoundError


 class FileIO(object):
@@ -22,6 +23,17 @@ class FileIO(object):
             # Might raise FileNotFoundError, OSError for Python 2
             return None

+    def _touch(self):
+        try:
+            # Update the modification time if the file already exists ...
+            os.utime(self.path, None)
+        except FileNotFoundError:
+            try:
+                # ... otherwise create it empty, like the `touch` utility.
+                file = open(self.path, 'a')
+                file.close()
+            except (OSError, IOError):  # TODO Maybe log this?
+                return False
+        return True

     def __repr__(self):
         return '%s(%s)' % (self.__class__.__name__, self.path)
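
_touch() mirrors the Unix touch utility: it bumps the mtime if the file
exists and creates it empty otherwise. For example (hypothetical path):

    from parso.file_io import FileIO

    lock = FileIO('/tmp/PARSO-CACHE-LOCK')  # hypothetical path
    assert lock._touch()  # first call creates the empty file
    assert lock._touch()  # subsequent calls only update its mtime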

--- a/test/test_cache.py
+++ b/test/test_cache.py

@@ -2,13 +2,19 @@
 Test all things related to the ``jedi.cache`` module.
 """

-from os import unlink
+import os
+import os.path

 import pytest
 import time

-from parso.cache import _NodeCacheItem, save_module, load_module, \
-    _get_hashed_path, parser_cache, _load_from_file_system, _save_to_file_system
+from parso.cache import (_CACHED_FILE_MAXIMUM_SURVIVAL, _VERSION_TAG,
+                         _get_cache_clear_lock, _get_hashed_path,
+                         _load_from_file_system, _NodeCacheItem,
+                         _remove_cache_and_update_lock, _save_to_file_system,
+                         clear_inactive_cache, load_module, parser_cache,
+                         save_module)
+from parso._compatibility import is_pypy
 from parso import load_grammar
 from parso import cache
 from parso import file_io
@@ -16,15 +22,13 @@ from parso import parse
 @pytest.fixture()
-def isolated_jedi_cache(monkeypatch, tmpdir):
-    """
-    Set `jedi.settings.cache_directory` to a temporary directory during test.
-
-    Same as `clean_jedi_cache`, but create the temporary directory for
-    each test case (scope='function').
-    """
-    monkeypatch.setattr(cache, '_default_cache_path', str(tmpdir))
+def isolated_parso_cache(monkeypatch, tmpdir):
+    """Set `parso.cache._default_cache_path` to a temporary directory
+    during the test."""
+    cache_path = os.path.join(str(tmpdir), "__parso_cache")
+    monkeypatch.setattr(cache, '_default_cache_path', cache_path)
+    monkeypatch.setattr(cache, '_get_default_cache_path',
+                        lambda *args, **kwargs: cache_path)
+    return cache_path


 def test_modulepickling_change_cache_dir(tmpdir):
""" """
@@ -57,7 +61,7 @@ def load_stored_item(hashed_grammar, path, item, cache_path):
     return item


-@pytest.mark.usefixtures("isolated_jedi_cache")
+@pytest.mark.usefixtures("isolated_parso_cache")
 def test_modulepickling_simulate_deleted_cache(tmpdir):
     """
     Tests loading from a cache file after it is deleted.
@@ -84,7 +88,7 @@ def test_modulepickling_simulate_deleted_cache(tmpdir):
     save_module(grammar._hashed, io, module, lines=[])
     assert load_module(grammar._hashed, io) == module

-    unlink(_get_hashed_path(grammar._hashed, path))
+    os.unlink(_get_hashed_path(grammar._hashed, path))
     parser_cache.clear()

     cached2 = load_module(grammar._hashed, io)
@@ -139,3 +143,32 @@ def test_cache_last_used_update(diff_cache, use_file_io):
     node_cache_item = next(iter(parser_cache.values()))[p]
     assert now < node_cache_item.last_used < time.time()

+
+@pytest.mark.skipif(
+    is_pypy,
+    reason="pickling in PyPy is slow; since we don't pickle there, "
+           "we never reach the auto-collection path"
+)
+def test_inactive_cache(tmpdir, isolated_parso_cache):
+    parser_cache.clear()
+    test_subjects = "abcdef"
+    for path in test_subjects:
+        parse('somecode', cache=True, path=os.path.join(str(tmpdir), path))
+    raw_cache_path = os.path.join(isolated_parso_cache, _VERSION_TAG)
+    assert os.path.exists(raw_cache_path)
+    paths = os.listdir(raw_cache_path)
+    a_while_ago = time.time() - _CACHED_FILE_MAXIMUM_SURVIVAL
+    old_paths = set()
+    # Make half of the cached files look old.
+    for path in paths[:len(test_subjects) // 2]:
+        os.utime(os.path.join(raw_cache_path, path), (a_while_ago, a_while_ago))
+        old_paths.add(path)
+    # Nothing should be cleared while the lock is fresh.
+    assert os.path.exists(_get_cache_clear_lock().path)
+    _remove_cache_and_update_lock()  # it shouldn't clear anything
+    assert len(os.listdir(raw_cache_path)) == len(test_subjects)
+    assert old_paths.issubset(os.listdir(raw_cache_path))
+
+    os.utime(_get_cache_clear_lock().path, (a_while_ago, a_while_ago))
+    _remove_cache_and_update_lock()
+    assert len(os.listdir(raw_cache_path)) == len(test_subjects) // 2
+    assert not old_paths.intersection(os.listdir(raw_cache_path))