From 989e4bac89cb3a7b8335e9e9d2b1702a0a5786f3 Mon Sep 17 00:00:00 2001 From: Dave Halter Date: Thu, 9 Mar 2017 08:58:57 +0100 Subject: [PATCH] Speed up splitlines. We use the python function again with the modifications we need. I ran it with: python3 -m timeit -n 10000 -s 'from jedi.common import splitlines; x = open("test_regression.py").read()' The speed differences are quite remarkable, it's ~3 times faster: 10000 loops, best of 3: 52.1 usec per loop vs. the old: 10000 loops, best of 3: 148 usec per loop We might need to speed up splitlines with keepends=False as well. It's probably also a factor 2-3 slower than it should be. --- jedi/common.py | 39 ++++++++++++++++++++++----------------- test/test_common.py | 4 ++++ 2 files changed, 26 insertions(+), 17 deletions(-) diff --git a/jedi/common.py b/jedi/common.py index ee60636a..abb271ae 100644 --- a/jedi/common.py +++ b/jedi/common.py @@ -159,24 +159,29 @@ def splitlines(string, keepends=False): also on form feeds. """ if keepends: - # If capturing parentheses are used in pattern, then the text of all - # groups in the pattern are also returned as part of the resulting - # list. - lst = re.split('(\n|\r\n)', string) + lst = string.splitlines(keepends=True) - # Need to merge the new lines with the actual lines. - odd = False - lines = [] - for string in lst: - if odd: - line += string - lines.append(line) - else: - line = string - odd = not odd - if odd: - lines.append(line) - return lines + # We have to merge lines that were broken by form feed characters. + merge = [] + for i, line in enumerate(lst): + if line.endswith('\f'): + merge.append(i) + + for index in reversed(merge): + try: + lst[index] = lst[index] + lst[index + 1] + del lst[index + 1] + except IndexError: + # index + 1 can be empty and therefore there's no need to + # merge. + pass + + # The stdlib's implementation of the end is inconsistent when calling + # it with/without keepends. One time there's an empty string in the + # end, one time there's none. 
+ if string.endswith('\n') or string == '': + lst.append('') + return lst else: return re.split('\n|\r\n', string) diff --git a/test/test_common.py b/test/test_common.py index 8e3aae6b..217cdf52 100644 --- a/test/test_common.py +++ b/test/test_common.py @@ -5,9 +5,13 @@ def test_splitlines_no_keepends(): assert splitlines('asd\r\n') == ['asd', ''] assert splitlines('asd\r\n\f') == ['asd', '\f'] assert splitlines('\fasd\r\n') == ['\fasd', ''] + assert splitlines('') == [''] + assert splitlines('\n') == ['', ''] def test_splitlines_keepends(): assert splitlines('asd\r\n', keepends=True) == ['asd\r\n', ''] assert splitlines('asd\r\n\f', keepends=True) == ['asd\r\n', '\f'] assert splitlines('\fasd\r\n', keepends=True) == ['\fasd\r\n', ''] + assert splitlines('', keepends=True) == [''] + assert splitlines('\n', keepends=True) == ['\n', '']