Make sure to limit the amount of cached files parso stores, fixes davidhalter/jedi#1340

2025-12-07 05:14:29 +08:00 · 2020-01-05 23:44:51 +01:00
parent 29b57d93bd
commit 2b0b093276
2 changed files with 87 additions and 2 deletions
--- a/parso/cache.py
+++ b/parso/cache.py
@@ -17,6 +17,21 @@ from parso._compatibility import FileNotFoundError

 LOG = logging.getLogger(__name__)

+_CACHED_FILE_MINIMUM_SURVIVAL = 60 * 10  # 10 minutes, expressed in seconds
+"""
+Cached files should survive at least a few minutes.
+"""
+_CACHED_SIZE_TRIGGER = 600  # entry count summed over all grammars
+"""
+This setting limits the amount of cached files. It's basically a way to start
+garbage collection.
+
+The reasoning for this limit being as big as it is, is the following:
+
+Numpy, Pandas, Matplotlib and Tensorflow together use about 500 files. This
+makes Jedi use ~500mb of memory. Since we might want a bit more than those few
+libraries, we just increase it a bit.
+"""

 _PICKLE_VERSION = 32
 """
@@ -76,6 +91,7 @@ class _NodeCacheItem(object):
        if change_time is None:
            change_time = time.time()
        self.change_time = change_time
+        self.last_used = change_time


 def load_module(hashed_grammar, file_io, cache_path=None):
@@ -89,6 +105,7 @@ def load_module(hashed_grammar, file_io, cache_path=None):
    try:
        module_cache_item = parser_cache[hashed_grammar][file_io.path]
        if p_time <= module_cache_item.change_time:
+            module_cache_item.last_used = time.time()
            return module_cache_item.node
    except KeyError:
        return _load_from_file_system(
@@ -122,11 +139,27 @@ def _load_from_file_system(hashed_grammar, path, p_time, cache_path=None):
    except FileNotFoundError:
        return None
    else:
-        parser_cache.setdefault(hashed_grammar, {})[path] = module_cache_item
+        _set_cache_item(hashed_grammar, path, module_cache_item)
        LOG.debug('pickle loaded: %s', path)
        return module_cache_item.node


+def _set_cache_item(hashed_grammar, path, module_cache_item):
+    """Store an item in ``parser_cache``, evicting stale entries when full."""
+    if sum(map(len, parser_cache.values())) >= _CACHED_SIZE_TRIGGER:
+        # Garbage collection: keep only the entries that were used within
+        # the minimum survival window; everything older is dropped.
+        oldest_allowed = time.time() - _CACHED_FILE_MINIMUM_SURVIVAL
+        for grammar_key, items_by_path in parser_cache.items():
+            parser_cache[grammar_key] = dict(
+                (cached_path, cached_item)
+                for cached_path, cached_item in items_by_path.items()
+                if cached_item.last_used > oldest_allowed
+            )
+
+    parser_cache.setdefault(hashed_grammar, {})[path] = module_cache_item
+
+
 def save_module(hashed_grammar, file_io, module, lines, pickling=True, cache_path=None):
    path = file_io.path
    try:
@@ -136,7 +169,7 @@ def save_module(hashed_grammar, file_io, module, lines, pickling=True, cache_pat
        pickling = False

    item = _NodeCacheItem(module, lines, p_time)
-    parser_cache.setdefault(hashed_grammar, {})[path] = item
+    _set_cache_item(hashed_grammar, path, item)
    if pickling and path is not None:
        _save_to_file_system(hashed_grammar, path, item, cache_path=cache_path)

--- a/test/test_cache.py
+++ b/test/test_cache.py
@@ -5,12 +5,14 @@ Test all things related to the ``jedi.cache`` module.
 from os import unlink

 import pytest
+import time

 from parso.cache import _NodeCacheItem, save_module, load_module, \
    _get_hashed_path, parser_cache, _load_from_file_system, _save_to_file_system
 from parso import load_grammar
 from parso import cache
 from parso import file_io
+from parso import parse


@pytest.fixture()
@@ -87,3 +89,53 @@ def test_modulepickling_simulate_deleted_cache(tmpdir):

    cached2 = load_module(grammar._hashed, io)
    assert cached2 is None
+
+
+def test_cache_limit():
+    """Hitting the size trigger evicts stale entries and keeps fresh ones."""
+    def cache_size():
+        return sum(map(len, parser_cache.values()))
+
+    paths = ['/path/%s' % i for i in range(300)]
+    try:
+        parser_cache.clear()
+        # ``last_used`` starts out as ``change_time``: one item lies far in
+        # the future (survives), the other far in the past (evicted).
+        fresh_item = _NodeCacheItem('bla', [], change_time=time.time() + 10e6)
+        stale_item = _NodeCacheItem('bla', [], change_time=time.time() - 10e4)
+        parser_cache['some_hash_old'] = dict.fromkeys(paths, stale_item)
+        parser_cache['some_hash_new'] = dict.fromkeys(paths, fresh_item)
+        assert cache_size() == 600
+        parse('somecode', cache=True, path='/path/somepath')
+        assert cache_size() == 301
+    finally:
+        parser_cache.clear()
+
+
+class _FixedTimeFileIO(file_io.KnownContentFileIO):  # pins get_last_modified()
+    def __init__(self, path, content, last_modified):
+        super(_FixedTimeFileIO, self).__init__(path, content)
+        self._last_modified = last_modified  # reported by get_last_modified()
+
+    def get_last_modified(self):
+        return self._last_modified  # the constructor-supplied value, unchanged
+
+
+@pytest.mark.parametrize('diff_cache', [False, True])
+@pytest.mark.parametrize('use_file_io', [False, True])
+def test_cache_last_used_update(diff_cache, use_file_io):
+    """Re-parsing a cached path must leave its entry with a fresh ``last_used``."""
+    p = '/path/last-used'
+    parser_cache.clear()  # Clear, because then it's easier to find stuff.
+    parse('somecode', cache=True, path=p)
+    node_cache_item = next(iter(parser_cache.values()))[p]
+    now = time.time()
+    assert node_cache_item.last_used <= now  # <=: the clock may not have ticked
+    if use_file_io:
+        f = _FixedTimeFileIO(p, 'code', node_cache_item.last_used - 10)
+        parse(file_io=f, cache=True, diff_cache=diff_cache)
+    else:
+        parse('somecode2', cache=True, path=p, diff_cache=diff_cache)
+
+    node_cache_item = next(iter(parser_cache.values()))[p]
+    assert now <= node_cache_item.last_used <= time.time()  # non-strict, see above