From 0ebdf82c24c0700befcde99e7b2b366b5e7fccd1 Mon Sep 17 00:00:00 2001 From: Matt Fulgo Date: Tue, 4 Feb 2025 12:54:26 -0500 Subject: [PATCH] Optimize iter_lines handling of long lines In the previous implementation of iter_lines, lines that spanned chunks could cause a quadratic iteration of the line data. This could result in a much slower read time for files with long lines or lines significantly longer than the chunk size used when reading them. To avoid the issue, the StreamingBody now collects the line data and only constructs the line (copying the bytes into a new buffer) when it encounters a line ending. Closes #2774 --- botocore/response.py | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/botocore/response.py b/botocore/response.py index ba3fac9bab..0524a28044 100644 --- a/botocore/response.py +++ b/botocore/response.py @@ -54,6 +54,7 @@ def __init__(self, raw_stream, content_length): self._raw_stream = raw_stream self._content_length = content_length self._amount_read = 0 + self._pending_line_data = [] def __del__(self): # Extending destructor in order to preserve the underlying raw_stream. @@ -138,14 +139,28 @@ def iter_lines(self, chunk_size=_DEFAULT_CHUNK_SIZE, keepends=False): This is achieved by reading chunk of bytes (of size chunk_size) at a time from the raw stream, and then yielding lines from there. 
""" - pending = b'' + + def _maybe_trim(line): + if keepends or not line.endswith(b'\n'): + return line + return line[:-2] if line.endswith(b'\r\n') else line[:-1] + + def _flush(final_segment): + line = b''.join(self._pending_line_data + [final_segment]) + self._pending_line_data = [] + return _maybe_trim(line) + for chunk in self.iter_chunks(chunk_size): - lines = (pending + chunk).splitlines(True) - for line in lines[:-1]: - yield line.splitlines(keepends)[0] - pending = lines[-1] - if pending: - yield pending.splitlines(keepends)[0] + lines = chunk.splitlines(True) + if len(lines) > 1: + for segment in lines[0:-1]: + yield _flush(segment) + if lines[-1].endswith(b'\n'): + yield _flush(lines[-1]) + else: + self._pending_line_data += lines[-1:] + if last := _flush(b''): + yield last def iter_chunks(self, chunk_size=_DEFAULT_CHUNK_SIZE): """Return an iterator to yield chunks of chunk_size bytes from the raw