From 0ebdf82c24c0700befcde99e7b2b366b5e7fccd1 Mon Sep 17 00:00:00 2001 From: Matt Fulgo Date: Tue, 4 Feb 2025 12:54:26 -0500 Subject: [PATCH] Optimize iter_lines handling of long lines In the previous implementation of iter_lines, lines that spanned chunks could cause a quadratic iteration of the line data. This could result in a much slower read time for files with long lines or lines significantly longer than the chunk size used when reading them. To avoid the issue, the StreamingBody now collects the line data and only constructs the line (copying the bytes into a new buffer) when it encounters a line ending. Closes #2774 --- botocore/response.py | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/botocore/response.py b/botocore/response.py index ba3fac9bab..0524a28044 100644 --- a/botocore/response.py +++ b/botocore/response.py @@ -54,6 +54,7 @@ def __init__(self, raw_stream, content_length): self._raw_stream = raw_stream self._content_length = content_length self._amount_read = 0 + self._pending_line_data = [] def __del__(self): # Extending destructor in order to preserve the underlying raw_stream. @@ -138,14 +139,28 @@ def iter_lines(self, chunk_size=_DEFAULT_CHUNK_SIZE, keepends=False): This is achieved by reading chunk of bytes (of size chunk_size) at a time from the raw stream, and then yielding lines from there. 
""" - pending = b'' + + def _maybe_trim(line): + if keepends or not line.endswith(b'\n'): + return line + return line[:-2] if line.endswith(b'\r\n') else line[:-1] + + def _flush(final_segment): + line = b''.join(self._pending_line_data + [final_segment]) + self._pending_line_data = [] + return _maybe_trim(line) + for chunk in self.iter_chunks(chunk_size): - lines = (pending + chunk).splitlines(True) - for line in lines[:-1]: - yield line.splitlines(keepends)[0] - pending = lines[-1] - if pending: - yield pending.splitlines(keepends)[0] + lines = chunk.splitlines(True) + if len(lines) > 1: + for segment in lines[0:-1]: + yield _flush(segment) + if lines[-1].endswith(b'\n'): + yield _flush(lines[-1]) + else: + self._pending_line_data += lines[-1:] + if last := _flush(b''): + yield last def iter_chunks(self, chunk_size=_DEFAULT_CHUNK_SIZE): """Return an iterator to yield chunks of chunk_size bytes from the raw