From fcfb002f8bced5872923292296f82695249ee832 Mon Sep 17 00:00:00 2001
From: maurycy <5383+maurycy@users.noreply.github.com>
Date: Wed, 3 Jun 2026 13:46:33 +0200
Subject: [PATCH 1/9] taking a stab

---
 Lib/profiling/sampling/gecko_collector.py | 347 +++++++++++++++++-----
 1 file changed, 266 insertions(+), 81 deletions(-)

diff --git a/Lib/profiling/sampling/gecko_collector.py b/Lib/profiling/sampling/gecko_collector.py
index 54392af95000082..dd899c0d745a0f3 100644
--- a/Lib/profiling/sampling/gecko_collector.py
+++ b/Lib/profiling/sampling/gecko_collector.py
@@ -1,8 +1,11 @@
+import array
 import itertools
+import io
 import json
 import os
 import platform
 import sys
+import tempfile
 import threading
 import time
 
@@ -61,6 +64,102 @@
 PROCESS_TYPE_MAIN = 0
 STACKWALK_DISABLED = 0
 
+# In-memory buffer before spilling to disk
+DEFAULT_SPILL_BUFFER_BYTES = 128 * 1024
+
+
+class TypedSpillColumn:
+    def __init__(self, directory, basename, typecode, *,
+                 buffer_bytes=DEFAULT_SPILL_BUFFER_BYTES):
+        self.path = os.path.join(directory, basename)
+        self.buffer = array.array(typecode)
+        self.max_items = max(1, buffer_bytes // self.buffer.itemsize)
+
+    def append(self, value):
+        self.buffer.append(value)
+        if len(self.buffer) >= self.max_items:
+            self.flush()
+
+    def flush(self):
+        with open(self.path, "ab") as file:
+            self.buffer.tofile(file)
+        self.buffer.clear()
+
+    def iter_chunks(self):
+        typecode = self.buffer.typecode
+        block_bytes = self.max_items * self.buffer.itemsize
+        with open(self.path, "rb") as file:
+            for block in iter(lambda: file.read(block_bytes), b""):
+                chunk = array.array(typecode)
+                chunk.frombytes(block)
+                yield chunk
+
+
+class NDJSONSpillColumn:
+    _encoder = json.JSONEncoder(separators=(",", ":"))
+
+    def __init__(self, directory, basename, *,
+                 buffer_bytes=DEFAULT_SPILL_BUFFER_BYTES):
+        self.path = os.path.join(directory, basename)
+        self.buffer = bytearray()
+        self._buffer_bytes = buffer_bytes
+
+    def append_object(self, data):
+        self.buffer += (self._encoder.encode(data) + "\n").encode()
+        if len(self.buffer) >= self._buffer_bytes:
+            self.flush()
+
+    def flush(self):
+        with open(self.path, "ab") as file:
+            file.write(self.buffer)
+        self.buffer.clear()
+
+    def iter_lines(self):
+        with open(self.path) as file:
+            for line in file:
+                yield line.rstrip("\n")
+
+
+class GeckoThreadSpill:
+    _TYPED_COLUMNS = (
+        ("samples_stack", "samples-stack.bin", "q"),
+        ("samples_time", "samples-time.bin", "d"),
+        ("markers_name", "markers-name.bin", "q"),
+        ("markers_start_time", "markers-start-time.bin", "d"),
+        ("markers_end_time", "markers-end-time.bin", "d"),
+        ("markers_phase", "markers-phase.bin", "B"),
+        ("markers_category", "markers-category.bin", "I"),
+    )
+
+    def __init__(self, directory, tid):
+        prefix = f"thread-{tid}-"
+        for attr, basename, typecode in self._TYPED_COLUMNS:
+            setattr(self, attr, TypedSpillColumn(
+                directory, prefix + basename, typecode))
+        self.markers_data = NDJSONSpillColumn(
+            directory, prefix + "markers-data.ndjson")
+        self.sample_count = 0
+        self.marker_count = 0
+
+    def append_sample(self, stack_index, time_ms):
+        self.samples_stack.append(stack_index)
+        self.samples_time.append(time_ms)
+        self.sample_count += 1
+
+    def append_marker(self, name_idx, start_time, end_time, phase, category, data):
+        self.markers_name.append(name_idx)
+        self.markers_start_time.append(start_time)
+        self.markers_end_time.append(end_time)
+        self.markers_phase.append(phase)
+        self.markers_category.append(category)
+        self.markers_data.append_object(data)
+        self.marker_count += 1
+
+    def prepare_read(self):
+        for attr, _basename, _typecode in self._TYPED_COLUMNS:
+            getattr(self, attr).flush()
+        self.markers_data.flush()
+
 
 class GeckoCollector(Collector):
     aggregating = True
@@ -77,6 +176,9 @@ def __init__(self, sample_interval_usec, *, skip_idle=False, opcodes=False):
 
         # Per-thread data structures
         self.threads = {}  # tid -> thread data
+        self.spill_dir = None
+        self.thread_spills = {}
+        self.exported = False
 
         # Global tables
         self.libs = []
@@ -151,6 +253,9 @@ def collect(self, stack_frames, timestamps_us=None):
             stack_frames: List of interpreter/thread frame info
             timestamps_us: List of timestamps in microseconds (None for live sampling)
         """
+        if self.exported:
+            raise RuntimeError("cannot append to GeckoCollector after export")
+
         # Handle live sampling (no timestamps provided)
         if timestamps_us is None:
             current_time = (time.monotonic() * 1000) - self.start_time
@@ -259,15 +364,9 @@ def collect(self, stack_frames, timestamps_us=None):
                 stack_index = self._process_stack(thread_data, frames)
 
                 # Add samples with timestamps
-                samples = thread_data["samples"]
-                samples_stack = samples["stack"]
-                samples_time = samples["time"]
-                samples_delay = samples["eventDelay"]
-
+                thread_spill = self.thread_spills[tid]
                 for t in times:
-                    samples_stack.append(stack_index)
-                    samples_time.append(t)
-                    samples_delay.append(None)
+                    thread_spill.append_sample(stack_index, t)
 
                 # Handle opcodes
                 if self.opcodes_enabled and frames:
@@ -294,6 +393,10 @@ def collect(self, stack_frames, timestamps_us=None):
 
     def _create_thread(self, tid, is_main_thread):
         """Create a new thread structure with processed profile format."""
+        if self.spill_dir is None:
+            self.spill_dir = tempfile.TemporaryDirectory()
+
+        self.thread_spills[tid] = GeckoThreadSpill(self.spill_dir.name, tid)
 
         thread = {
             "name": f"Thread-{tid}",
@@ -307,15 +410,6 @@ def _create_thread(self, tid, is_main_thread):
             "tid": tid,
             "processType": "default",
             "processName": "Python Process",
-            # Sample data - processed format with direct arrays
-            "samples": {
-                "stack": [],
-                "time": [],
-                "eventDelay": [],
-                "weight": None,
-                "weightType": "samples",
-                "length": 0,  # Will be updated on export
-            },
             # Stack table - processed format
             "stackTable": {
                 "frame": [],
@@ -366,16 +460,6 @@ def _create_thread(self, tid, is_main_thread):
                 "functionSize": [],
                 "length": 0,
             },
-            # Markers - processed format (arrays)
-            "markers": {
-                "data": [],
-                "name": [],
-                "startTime": [],
-                "endTime": [],
-                "phase": [],
-                "category": [],
-                "length": 0,
-            },
             # Caches for deduplication
             "_stackCache": {},
             "_frameCache": {},
@@ -405,17 +489,10 @@ def _add_marker(self, tid, name, start_time, end_time, category):
         if tid not in self.threads:
             return
 
-        thread_data = self.threads[tid]
         duration = end_time - start_time
 
         name_idx = self._intern_string(name)
-        markers = thread_data["markers"]
-        markers["name"].append(name_idx)
-        markers["startTime"].append(start_time)
-        markers["endTime"].append(end_time)
-        markers["phase"].append(1)  # 1 = interval marker
-        markers["category"].append(category)
-        markers["data"].append({
+        self.thread_spills[tid].append_marker(name_idx, start_time, end_time, 1, category, {
             "type": name.replace(" ", ""),
             "duration": duration,
             "tid": tid
@@ -426,20 +503,13 @@ def _add_opcode_interval_marker(self, tid, opcode, lineno, col_offset, funcname,
         if tid not in self.threads or opcode is None:
             return
 
-        thread_data = self.threads[tid]
         opcode_info = get_opcode_info(opcode)
         # Use formatted opcode name (with base opcode for specialized ones)
         formatted_opname = format_opcode(opcode)
 
         name_idx = self._intern_string(formatted_opname)
 
-        markers = thread_data["markers"]
-        markers["name"].append(name_idx)
-        markers["startTime"].append(start_time)
-        markers["endTime"].append(end_time)
-        markers["phase"].append(1)  # 1 = interval marker
-        markers["category"].append(CATEGORY_OPCODES)
-        markers["data"].append({
+        self.thread_spills[tid].append_marker(name_idx, start_time, end_time, 1, CATEGORY_OPCODES, {
             "type": "Opcode",
             "opcode": opcode,
             "opname": formatted_opname,
@@ -660,7 +730,6 @@ def _finalize_markers(self):
 
     def export(self, filename):
         """Export the profile to a Gecko JSON file."""
-
         if self.sample_count > 0 and self.last_sample_time > 0:
             self.interval = self.last_sample_time / self.sample_count
 
@@ -681,19 +750,31 @@ def spin():
         spinner_thread = threading.Thread(target=spin, daemon=True)
         spinner_thread.start()
 
+        temp_path = None
+        replaced = False
         try:
-            # Finalize any open markers before building profile
-            self._finalize_markers()
-
-            profile = self._build_profile()
-
-            with open(filename, "w") as f:
-                json.dump(profile, f, separators=(",", ":"))
+            self._prepare_for_serialization()
+            output_dir = os.path.dirname(os.path.abspath(filename)) or "."
+            with tempfile.NamedTemporaryFile(
+                "w", dir=output_dir, delete=False
+            ) as file:
+                temp_path = file.name
+                self._stream_profile(file)
+            os.replace(temp_path, filename)
+            replaced = True
         finally:
+            self.exported = True
             stop_spinner.set()
             spinner_thread.join(timeout=1.0)
             # Small delay to ensure the clear happens
             time.sleep(0.01)
+            if temp_path is not None and not replaced:
+                try:
+                    os.unlink(temp_path)
+                except FileNotFoundError:
+                    pass
+            if self.spill_dir is not None:
+                self.spill_dir.cleanup()
 
         print(f"Gecko profile written to {filename}")
         print(
@@ -727,34 +808,18 @@ def _build_marker_schema(self):
 
     def _build_profile(self):
         """Build the complete profile structure in processed format."""
-        # Convert thread data to final format
-        threads = []
-
-        for tid, thread_data in self.threads.items():
-            # Update lengths
-            samples = thread_data["samples"]
-            stack_table = thread_data["stackTable"]
-            frame_table = thread_data["frameTable"]
-            func_table = thread_data["funcTable"]
-            resource_table = thread_data["resourceTable"]
-
-            samples["length"] = len(samples["stack"])
-            stack_table["length"] = len(stack_table["frame"])
-            frame_table["length"] = len(frame_table["func"])
-            func_table["length"] = len(func_table["name"])
-            resource_table["length"] = len(resource_table["name"])
-            thread_data["markers"]["length"] = len(thread_data["markers"]["name"])
-
-            # Clean up internal caches
-            del thread_data["_stackCache"]
-            del thread_data["_frameCache"]
-            del thread_data["_funcCache"]
-            del thread_data["_resourceCache"]
-
-            threads.append(thread_data)
-
-        # Main profile structure in processed format
-        profile = {
+        try:
+            self._prepare_for_serialization()
+            file = io.StringIO()
+            self._stream_profile(file)
+            return json.loads(file.getvalue())
+        finally:
+            self.exported = True
+            if self.spill_dir is not None:
+                self.spill_dir.cleanup()
+
+    def _profile_head(self):
+        return {
             "meta": {
                 "interval": self.interval,
                 "startTime": self.start_time,
@@ -784,7 +849,10 @@ def _build_profile(self):
                 },
             },
             "libs": self.libs,
-            "threads": threads,
+        }
+
+    def _profile_tail(self):
+        return {
             "pages": [],
             "shared": {
                 "stringArray": self.global_strings,
@@ -792,4 +860,121 @@ def _build_profile(self):
             },
         }
 
-        return profile
+    def _prepare_for_serialization(self):
+        if self.exported:
+            raise RuntimeError("GeckoCollector has already been exported")
+        self._finalize_markers()
+        for spill in self.thread_spills.values():
+            spill.prepare_read()
+        for thread_data in self.threads.values():
+            thread_data["stackTable"]["length"] = len(thread_data["stackTable"]["frame"])
+            thread_data["frameTable"]["length"] = len(thread_data["frameTable"]["func"])
+            thread_data["funcTable"]["length"] = len(thread_data["funcTable"]["name"])
+            thread_data["resourceTable"]["length"] = len(thread_data["resourceTable"]["name"])
+
+    def _stream_profile(self, file):
+        head = json.dumps(
+            self._profile_head(), separators=(",", ":"), allow_nan=False
+        )[1:-1]
+        tail = json.dumps(
+            self._profile_tail(), separators=(",", ":"), allow_nan=False
+        )[1:-1]
+        file.write("{")
+        file.write(head)
+        file.write(',"threads":[')
+        for index, (tid, thread_data) in enumerate(self.threads.items()):
+            if index:
+                file.write(",")
+            self._stream_thread(file, tid, thread_data)
+        file.write("],")
+        file.write(tail)
+        file.write("}")
+
+    def _stream_thread(self, file, tid, thread_data):
+        spill = self.thread_spills[tid]
+        metadata = {
+            "name": thread_data["name"],
+            "isMainThread": thread_data["isMainThread"],
+            "processStartupTime": thread_data["processStartupTime"],
+            "processShutdownTime": thread_data["processShutdownTime"],
+            "registerTime": thread_data["registerTime"],
+            "unregisterTime": thread_data["unregisterTime"],
+            "pausedRanges": thread_data["pausedRanges"],
+            "pid": thread_data["pid"],
+            "tid": thread_data["tid"],
+            "processType": thread_data["processType"],
+            "processName": thread_data["processName"],
+        }
+        file.write("{")
+        file.write(json.dumps(metadata, separators=(",", ":"), allow_nan=False)[1:-1])
+        file.write(',"samples":')
+        self._stream_samples(file, spill)
+        for key in (
+            "stackTable",
+            "frameTable",
+            "funcTable",
+            "resourceTable",
+            "nativeSymbols",
+        ):
+            file.write(',"')
+            file.write(key)
+            file.write('":')
+            file.write(json.dumps(
+                thread_data[key], separators=(",", ":"), allow_nan=False
+            ))
+        file.write(',"markers":')
+        self._stream_markers(file, spill)
+        file.write("}")
+
+    def _stream_samples(self, file, spill):
+        file.write('{"stack":')
+        _stream_array(file, _tokens(spill.samples_stack), spill.sample_count)
+        file.write(',"time":')
+        _stream_array(file, _tokens(spill.samples_time), spill.sample_count)
+        file.write(',"eventDelay":')
+        _stream_array(
+            file,
+            ("null" for _ in range(spill.sample_count)),
+            spill.sample_count,
+        )
+        file.write(',"weight":null,"weightType":"samples","length":')
+        file.write(repr(spill.sample_count))
+        file.write("}")
+
+    def _stream_markers(self, file, spill):
+        file.write('{"data":')
+        _stream_array(file, spill.markers_data.iter_lines(), spill.marker_count)
+        file.write(',"name":')
+        _stream_array(file, _tokens(spill.markers_name), spill.marker_count)
+        file.write(',"startTime":')
+        _stream_array(file, _tokens(spill.markers_start_time), spill.marker_count)
+        file.write(',"endTime":')
+        _stream_array(file, _tokens(spill.markers_end_time), spill.marker_count)
+        file.write(',"phase":')
+        _stream_array(file, _tokens(spill.markers_phase), spill.marker_count)
+        file.write(',"category":')
+        _stream_array(file, _tokens(spill.markers_category), spill.marker_count)
+        file.write(',"length":')
+        file.write(repr(spill.marker_count))
+        file.write("}")
+
+
+def _stream_array(file, token_iter, expected_count):
+    file.write("[")
+    count = 0
+    for token in token_iter:
+        if count:
+            file.write(",")
+        file.write(token)
+        count += 1
+    if count != expected_count:
+        raise RuntimeError(
+            f"streamed {count} array items, expected {expected_count}"
+        )
+    file.write("]")
+
+
+def _tokens(column):
+    for chunk in column.iter_chunks():
+        for value in chunk:
+            yield repr(value)

From 57694cb972ba6cb2f031781932a0b82f95d7efaa Mon Sep 17 00:00:00 2001
From: maurycy <5383+maurycy@users.noreply.github.com>
Date: Wed, 3 Jun 2026 13:51:38 +0200
Subject: [PATCH 2/9] news

---
 .../Library/2026-06-03-13-51-29.gh-issue-150662.ELT8Vg.rst    | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 Misc/NEWS.d/next/Library/2026-06-03-13-51-29.gh-issue-150662.ELT8Vg.rst

diff --git a/Misc/NEWS.d/next/Library/2026-06-03-13-51-29.gh-issue-150662.ELT8Vg.rst b/Misc/NEWS.d/next/Library/2026-06-03-13-51-29.gh-issue-150662.ELT8Vg.rst
new file mode 100644
index 000000000000000..42ed6ad7cd3c65f
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2026-06-03-13-51-29.gh-issue-150662.ELT8Vg.rst
@@ -0,0 +1,4 @@
+Fix the ``--gecko`` collector in :mod:`profiling.sampling` that kept every
+sample in memory. It now writes sample and marker data to temporary files
+and reads them back, ultimately building the output file at the end. Patch
+by Pablo Galindo and Maurycy Pawłowski-Wieroński.

From e0805d01a604f740d61b7527ecaeff0bb52343cf Mon Sep 17 00:00:00 2001
From: maurycy <5383+maurycy@users.noreply.github.com>
Date: Wed, 3 Jun 2026 13:52:50 +0200
Subject: [PATCH 3/9] test

---
 .../test_profiling/test_sampling_profiler/test_collectors.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py
index 390a1479fdd2975..8d196a5a334ff8d 100644
--- a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py
+++ b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py
@@ -2702,10 +2702,7 @@ def test_gecko_opcode_state_change_emits_marker(self):
         collector.collect(frames2)
 
         # Should have emitted a marker for the first opcode
-        thread_data = collector.threads[1]
-        markers = thread_data["markers"]
-        # At least one marker should have been added
-        self.assertGreater(len(markers["name"]), 0)
+        self.assertGreater(collector.thread_spills[1].marker_count, 0)
 
     def test_gecko_opcode_markers_not_emitted_when_disabled(self):
         """Test that no opcode markers when opcodes=False."""

From 866b8fde2ff38aeba069d070a613b308d8ba7dca Mon Sep 17 00:00:00 2001
From: maurycy <5383+maurycy@users.noreply.github.com>
Date: Wed, 3 Jun 2026 15:15:20 +0200
Subject: [PATCH 4/9] this?

---
 .../test_profiling/test_sampling_profiler/test_collectors.py  | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py
index 8d196a5a334ff8d..e02b6c07bd4427f 100644
--- a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py
+++ b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py
@@ -2659,6 +2659,7 @@ def test_gecko_collector_opcodes_enabled(self):
     def test_gecko_opcode_state_tracking(self):
         """Test that GeckoCollector tracks opcode state changes."""
         collector = GeckoCollector(sample_interval_usec=1000, opcodes=True)
+        self.addCleanup(lambda: collector.spill_dir.cleanup())
 
         # First sample with opcode 90 (RAISE_VARARGS)
         frame1 = MockFrameInfo("test.py", 10, "func", opcode=90)
@@ -2680,6 +2681,7 @@ def test_gecko_opcode_state_tracking(self):
     def test_gecko_opcode_state_change_emits_marker(self):
         """Test that opcode state change emits an interval marker."""
         collector = GeckoCollector(sample_interval_usec=1000, opcodes=True)
+        self.addCleanup(lambda: collector.spill_dir.cleanup())
 
         # First sample: opcode 90
         frame1 = MockFrameInfo("test.py", 10, "func", opcode=90)
@@ -2707,6 +2709,7 @@ def test_gecko_opcode_state_change_emits_marker(self):
     def test_gecko_opcode_markers_not_emitted_when_disabled(self):
         """Test that no opcode markers when opcodes=False."""
         collector = GeckoCollector(sample_interval_usec=1000, opcodes=False)
+        self.addCleanup(lambda: collector.spill_dir.cleanup())
 
         frame1 = MockFrameInfo("test.py", 10, "func", opcode=90)
         frames1 = [
@@ -2732,6 +2735,7 @@ def test_gecko_opcode_markers_not_emitted_when_disabled(self):
     def test_gecko_opcode_with_none_opcode(self):
         """Test that None opcode doesn't cause issues."""
         collector = GeckoCollector(sample_interval_usec=1000, opcodes=True)
+        self.addCleanup(lambda: collector.spill_dir.cleanup())
 
         # Frame with no opcode (None)
         frame = MockFrameInfo("test.py", 10, "func", opcode=None)

From a23814bbda140f983c6f76250136e9bfbcc20816 Mon Sep 17 00:00:00 2001
From: maurycy <5383+maurycy@users.noreply.github.com>
Date: Wed, 3 Jun 2026 15:30:33 +0200
Subject: [PATCH 5/9] now?

---
 .../test_profiling/test_sampling_profiler/test_collectors.py    | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py
index e02b6c07bd4427f..9fe0d8f1bb66ff1 100644
--- a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py
+++ b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py
@@ -624,6 +624,8 @@ def test_gecko_collector_export(self):
         """Test Gecko profile export functionality."""
         gecko_out = tempfile.NamedTemporaryFile(suffix=".json", delete=False)
         self.addCleanup(close_and_unlink, gecko_out)
+        # We cannot overwrite an open file on Windows.
+        gecko_out.close()
 
         collector = GeckoCollector(1000)
 

From 8d5c68779d658b9b6197411300550c7dbebcf91a Mon Sep 17 00:00:00 2001
From: maurycy <5383+maurycy@users.noreply.github.com>
Date: Thu, 4 Jun 2026 11:57:18 +0200
Subject: [PATCH 6/9] don't be too smart. bye `array`

---
 Lib/profiling/sampling/gecko_collector.py | 103 +++++++++-------------
 1 file changed, 41 insertions(+), 62 deletions(-)

diff --git a/Lib/profiling/sampling/gecko_collector.py b/Lib/profiling/sampling/gecko_collector.py
index dd899c0d745a0f3..36c47d64298fc1f 100644
--- a/Lib/profiling/sampling/gecko_collector.py
+++ b/Lib/profiling/sampling/gecko_collector.py
@@ -1,4 +1,3 @@
-import array
 import itertools
 import io
 import json
@@ -68,34 +67,7 @@
 DEFAULT_SPILL_BUFFER_BYTES = 128 * 1024
 
 
-class TypedSpillColumn:
-    def __init__(self, directory, basename, typecode, *,
-                 buffer_bytes=DEFAULT_SPILL_BUFFER_BYTES):
-        self.path = os.path.join(directory, basename)
-        self.buffer = array.array(typecode)
-        self.max_items = max(1, buffer_bytes // self.buffer.itemsize)
-
-    def append(self, value):
-        self.buffer.append(value)
-        if len(self.buffer) >= self.max_items:
-            self.flush()
-
-    def flush(self):
-        with open(self.path, "ab") as file:
-            self.buffer.tofile(file)
-        self.buffer.clear()
-
-    def iter_chunks(self):
-        typecode = self.buffer.typecode
-        block_bytes = self.max_items * self.buffer.itemsize
-        with open(self.path, "rb") as file:
-            for block in iter(lambda: file.read(block_bytes), b""):
-                chunk = array.array(typecode)
-                chunk.frombytes(block)
-                yield chunk
-
-
-class NDJSONSpillColumn:
+class SpillColumn:
     _encoder = json.JSONEncoder(separators=(",", ":"))
 
     def __init__(self, directory, basename, *,
@@ -104,8 +76,8 @@ def __init__(self, directory, basename, *,
         self.buffer = bytearray()
         self._buffer_bytes = buffer_bytes
 
-    def append_object(self, data):
-        self.buffer += (self._encoder.encode(data) + "\n").encode()
+    def append(self, value):
+        self.buffer += (self._encoder.encode(value) + "\n").encode()
         if len(self.buffer) >= self._buffer_bytes:
             self.flush()
 
@@ -114,30 +86,28 @@ def flush(self):
             file.write(self.buffer)
         self.buffer.clear()
 
-    def iter_lines(self):
+    def iter_tokens(self):
         with open(self.path) as file:
             for line in file:
                 yield line.rstrip("\n")
 
 
 class GeckoThreadSpill:
-    _TYPED_COLUMNS = (
-        ("samples_stack", "samples-stack.bin", "q"),
-        ("samples_time", "samples-time.bin", "d"),
-        ("markers_name", "markers-name.bin", "q"),
-        ("markers_start_time", "markers-start-time.bin", "d"),
-        ("markers_end_time", "markers-end-time.bin", "d"),
-        ("markers_phase", "markers-phase.bin", "B"),
-        ("markers_category", "markers-category.bin", "I"),
+    _COLUMNS = (
+        ("samples_stack", "samples-stack.json"),
+        ("samples_time", "samples-time.json"),
+        ("markers_name", "markers-name.json"),
+        ("markers_start_time", "markers-start-time.json"),
+        ("markers_end_time", "markers-end-time.json"),
+        ("markers_phase", "markers-phase.json"),
+        ("markers_category", "markers-category.json"),
+        ("markers_data", "markers-data.json"),
     )
 
     def __init__(self, directory, tid):
         prefix = f"thread-{tid}-"
-        for attr, basename, typecode in self._TYPED_COLUMNS:
-            setattr(self, attr, TypedSpillColumn(
-                directory, prefix + basename, typecode))
-        self.markers_data = NDJSONSpillColumn(
-            directory, prefix + "markers-data.ndjson")
+        for attr, basename in self._COLUMNS:
+            setattr(self, attr, SpillColumn(directory, prefix + basename))
         self.sample_count = 0
         self.marker_count = 0
 
@@ -152,13 +122,12 @@ def append_marker(self, name_idx, start_time, end_time, phase, category, data):
         self.markers_end_time.append(end_time)
         self.markers_phase.append(phase)
         self.markers_category.append(category)
-        self.markers_data.append_object(data)
+        self.markers_data.append(data)
         self.marker_count += 1
 
     def prepare_read(self):
-        for attr, _basename, _typecode in self._TYPED_COLUMNS:
+        for attr, _basename in self._COLUMNS:
             getattr(self, attr).flush()
-        self.markers_data.flush()
 
 
 class GeckoCollector(Collector):
@@ -928,9 +897,13 @@ def _stream_thread(self, file, tid, thread_data):
 
     def _stream_samples(self, file, spill):
         file.write('{"stack":')
-        _stream_array(file, _tokens(spill.samples_stack), spill.sample_count)
+        _stream_array(
+            file, spill.samples_stack.iter_tokens(), spill.sample_count
+        )
         file.write(',"time":')
-        _stream_array(file, _tokens(spill.samples_time), spill.sample_count)
+        _stream_array(
+            file, spill.samples_time.iter_tokens(), spill.sample_count
+        )
         file.write(',"eventDelay":')
         _stream_array(
             file,
@@ -943,17 +916,29 @@ def _stream_samples(self, file, spill):
 
     def _stream_markers(self, file, spill):
         file.write('{"data":')
-        _stream_array(file, spill.markers_data.iter_lines(), spill.marker_count)
+        _stream_array(
+            file, spill.markers_data.iter_tokens(), spill.marker_count
+        )
         file.write(',"name":')
-        _stream_array(file, _tokens(spill.markers_name), spill.marker_count)
+        _stream_array(
+            file, spill.markers_name.iter_tokens(), spill.marker_count
+        )
         file.write(',"startTime":')
-        _stream_array(file, _tokens(spill.markers_start_time), spill.marker_count)
+        _stream_array(
+            file, spill.markers_start_time.iter_tokens(), spill.marker_count
+        )
         file.write(',"endTime":')
-        _stream_array(file, _tokens(spill.markers_end_time), spill.marker_count)
+        _stream_array(
+            file, spill.markers_end_time.iter_tokens(), spill.marker_count
+        )
         file.write(',"phase":')
-        _stream_array(file, _tokens(spill.markers_phase), spill.marker_count)
+        _stream_array(
+            file, spill.markers_phase.iter_tokens(), spill.marker_count
+        )
         file.write(',"category":')
-        _stream_array(file, _tokens(spill.markers_category), spill.marker_count)
+        _stream_array(
+            file, spill.markers_category.iter_tokens(), spill.marker_count
+        )
         file.write(',"length":')
         file.write(repr(spill.marker_count))
         file.write("}")
@@ -972,9 +957,3 @@ def _stream_array(file, token_iter, expected_count):
             f"streamed {count} array items, expected {expected_count}"
         )
     file.write("]")
-
-
-def _tokens(column):
-    for chunk in column.iter_chunks():
-        for value in chunk:
-            yield repr(value)

From 2e5871602ff4edcd241c2ba1d3f2b4477008e07c Mon Sep 17 00:00:00 2001
From: maurycy <5383+maurycy@users.noreply.github.com>
Date: Thu, 4 Jun 2026 12:01:52 +0200
Subject: [PATCH 7/9] explicit encoding

---
 Lib/profiling/sampling/gecko_collector.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Lib/profiling/sampling/gecko_collector.py b/Lib/profiling/sampling/gecko_collector.py
index 36c47d64298fc1f..06bc02db726408b 100644
--- a/Lib/profiling/sampling/gecko_collector.py
+++ b/Lib/profiling/sampling/gecko_collector.py
@@ -87,7 +87,7 @@ def flush(self):
         self.buffer.clear()
 
     def iter_tokens(self):
-        with open(self.path) as file:
+        with open(self.path, encoding="utf-8") as file:
             for line in file:
                 yield line.rstrip("\n")
 

From 89695bf6ad7cb3ce21fb8a05f9ca4b7ae26f7d28 Mon Sep 17 00:00:00 2001
From: Pablo Galindo Salgado <pablogsal@gmail.com>
Date: Sat, 6 Jun 2026 02:53:15 +0100
Subject: [PATCH 8/9] fixup! don't be too smart. bye `array`

---
 Lib/profiling/sampling/gecko_collector.py | 209 ++++++++++++----------
 1 file changed, 115 insertions(+), 94 deletions(-)

diff --git a/Lib/profiling/sampling/gecko_collector.py b/Lib/profiling/sampling/gecko_collector.py
index 06bc02db726408b..361f6037f216fdc 100644
--- a/Lib/profiling/sampling/gecko_collector.py
+++ b/Lib/profiling/sampling/gecko_collector.py
@@ -65,19 +65,24 @@
 
 # In-memory buffer before spilling to disk
 DEFAULT_SPILL_BUFFER_BYTES = 128 * 1024
+_JSON_SEPARATORS = (",", ":")
+_JSON_ENCODER = json.JSONEncoder(
+    separators=_JSON_SEPARATORS, allow_nan=False
+)
 
 
 class SpillColumn:
-    _encoder = json.JSONEncoder(separators=(",", ":"))
-
     def __init__(self, directory, basename, *,
-                 buffer_bytes=DEFAULT_SPILL_BUFFER_BYTES):
+                 buffer_bytes=None):
         self.path = os.path.join(directory, basename)
         self.buffer = bytearray()
-        self._buffer_bytes = buffer_bytes
+        self._buffer_bytes = (
+            DEFAULT_SPILL_BUFFER_BYTES if buffer_bytes is None
+            else buffer_bytes
+        )
 
     def append(self, value):
-        self.buffer += (self._encoder.encode(value) + "\n").encode()
+        self.buffer += (_JSON_ENCODER.encode(value) + "\n").encode("utf-8")
         if len(self.buffer) >= self._buffer_bytes:
             self.flush()
 
@@ -146,7 +151,6 @@ def __init__(self, sample_interval_usec, *, skip_idle=False, opcodes=False):
         # Per-thread data structures
         self.threads = {}  # tid -> thread data
         self.spill_dir = None
-        self.thread_spills = {}
         self.exported = False
 
         # Global tables
@@ -333,7 +337,7 @@ def collect(self, stack_frames, timestamps_us=None):
                 stack_index = self._process_stack(thread_data, frames)
 
                 # Add samples with timestamps
-                thread_spill = self.thread_spills[tid]
+                thread_spill = thread_data["_spill"]
                 for t in times:
                     thread_spill.append_sample(stack_index, t)
 
@@ -365,8 +369,6 @@ def _create_thread(self, tid, is_main_thread):
         if self.spill_dir is None:
             self.spill_dir = tempfile.TemporaryDirectory()
 
-        self.thread_spills[tid] = GeckoThreadSpill(self.spill_dir.name, tid)
-
         thread = {
             "name": f"Thread-{tid}",
             "isMainThread": is_main_thread,
@@ -434,6 +436,7 @@ def _create_thread(self, tid, is_main_thread):
             "_frameCache": {},
             "_funcCache": {},
             "_resourceCache": {},
+            "_spill": GeckoThreadSpill(self.spill_dir.name, tid),
         }
 
         return thread
@@ -461,13 +464,16 @@ def _add_marker(self, tid, name, start_time, end_time, category):
         duration = end_time - start_time
 
         name_idx = self._intern_string(name)
-        self.thread_spills[tid].append_marker(name_idx, start_time, end_time, 1, category, {
-            "type": name.replace(" ", ""),
-            "duration": duration,
-            "tid": tid
-        })
+        self.threads[tid]["_spill"].append_marker(
+            name_idx, start_time, end_time, 1, category, {
+                "type": name.replace(" ", ""),
+                "duration": duration,
+                "tid": tid,
+            }
+        )
 
-    def _add_opcode_interval_marker(self, tid, opcode, lineno, col_offset, funcname, start_time, end_time):
+    def _add_opcode_interval_marker(self, tid, opcode, lineno, col_offset,
+                                    funcname, start_time, end_time):
         """Add an interval marker for opcode execution span."""
         if tid not in self.threads or opcode is None:
             return
@@ -478,17 +484,19 @@ def _add_opcode_interval_marker(self, tid, opcode, lineno, col_offset, funcname,
 
         name_idx = self._intern_string(formatted_opname)
 
-        self.thread_spills[tid].append_marker(name_idx, start_time, end_time, 1, CATEGORY_OPCODES, {
-            "type": "Opcode",
-            "opcode": opcode,
-            "opname": formatted_opname,
-            "base_opname": opcode_info["base_opname"],
-            "is_specialized": opcode_info["is_specialized"],
-            "line": lineno,
-            "column": col_offset if col_offset >= 0 else None,
-            "function": funcname,
-            "duration": end_time - start_time,
-        })
+        self.threads[tid]["_spill"].append_marker(
+            name_idx, start_time, end_time, 1, CATEGORY_OPCODES, {
+                "type": "Opcode",
+                "opcode": opcode,
+                "opname": formatted_opname,
+                "base_opname": opcode_info["base_opname"],
+                "is_specialized": opcode_info["is_specialized"],
+                "line": lineno,
+                "column": col_offset if col_offset >= 0 else None,
+                "function": funcname,
+                "duration": end_time - start_time,
+            }
+        )
 
     def _process_stack(self, thread_data, frames):
         """Process a stack and return the stack index."""
@@ -742,8 +750,7 @@ def spin():
                     os.unlink(temp_path)
                 except FileNotFoundError:
                     pass
-            if self.spill_dir is not None:
-                self.spill_dir.cleanup()
+            self._cleanup_spills()
 
         print(f"Gecko profile written to {filename}")
         print(
@@ -784,8 +791,7 @@ def _build_profile(self):
             return json.loads(file.getvalue())
         finally:
             self.exported = True
-            if self.spill_dir is not None:
-                self.spill_dir.cleanup()
+            self._cleanup_spills()
 
     def _profile_head(self):
         return {
@@ -833,34 +839,38 @@ def _prepare_for_serialization(self):
         if self.exported:
             raise RuntimeError("GeckoCollector has already been exported")
         self._finalize_markers()
-        for spill in self.thread_spills.values():
-            spill.prepare_read()
         for thread_data in self.threads.values():
+            thread_data["_spill"].prepare_read()
             thread_data["stackTable"]["length"] = len(thread_data["stackTable"]["frame"])
             thread_data["frameTable"]["length"] = len(thread_data["frameTable"]["func"])
             thread_data["funcTable"]["length"] = len(thread_data["funcTable"]["name"])
             thread_data["resourceTable"]["length"] = len(thread_data["resourceTable"]["name"])
 
+    def _cleanup_spills(self):
+        if self.spill_dir is not None:
+            self.spill_dir.cleanup()
+            self.spill_dir = None
+
     def _stream_profile(self, file):
-        head = json.dumps(
-            self._profile_head(), separators=(",", ":"), allow_nan=False
-        )[1:-1]
-        tail = json.dumps(
-            self._profile_tail(), separators=(",", ":"), allow_nan=False
-        )[1:-1]
         file.write("{")
-        file.write(head)
-        file.write(',"threads":[')
+        first = True
+        for key, value in self._profile_head().items():
+            first = _write_json_member(file, key, value, first)
+
+        first = _write_member_name(file, "threads", first)
+        file.write("[")
         for index, (tid, thread_data) in enumerate(self.threads.items()):
             if index:
                 file.write(",")
             self._stream_thread(file, tid, thread_data)
-        file.write("],")
-        file.write(tail)
+        file.write("]")
+
+        for key, value in self._profile_tail().items():
+            first = _write_json_member(file, key, value, first)
         file.write("}")
 
     def _stream_thread(self, file, tid, thread_data):
-        spill = self.thread_spills[tid]
+        spill = thread_data["_spill"]
         metadata = {
             "name": thread_data["name"],
             "isMainThread": thread_data["isMainThread"],
@@ -875,8 +885,11 @@ def _stream_thread(self, file, tid, thread_data):
             "processName": thread_data["processName"],
         }
         file.write("{")
-        file.write(json.dumps(metadata, separators=(",", ":"), allow_nan=False)[1:-1])
-        file.write(',"samples":')
+        first = True
+        for key, value in metadata.items():
+            first = _write_json_member(file, key, value, first)
+
+        first = _write_member_name(file, "samples", first)
         self._stream_samples(file, spill)
         for key in (
             "stackTable",
@@ -885,66 +898,74 @@ def _stream_thread(self, file, tid, thread_data):
             "resourceTable",
             "nativeSymbols",
         ):
-            file.write(',"')
-            file.write(key)
-            file.write('":')
-            file.write(json.dumps(
-                thread_data[key], separators=(",", ":"), allow_nan=False
-            ))
-        file.write(',"markers":')
+            first = _write_json_member(file, key, thread_data[key], first)
+        first = _write_member_name(file, "markers", first)
         self._stream_markers(file, spill)
         file.write("}")
 
     def _stream_samples(self, file, spill):
-        file.write('{"stack":')
-        _stream_array(
-            file, spill.samples_stack.iter_tokens(), spill.sample_count
-        )
-        file.write(',"time":')
-        _stream_array(
-            file, spill.samples_time.iter_tokens(), spill.sample_count
-        )
-        file.write(',"eventDelay":')
-        _stream_array(
+        _stream_column_table(
             file,
-            ("null" for _ in range(spill.sample_count)),
+            (
+                ("stack", spill.samples_stack.iter_tokens()),
+                ("time", spill.samples_time.iter_tokens()),
+                ("eventDelay", ("null" for _ in range(spill.sample_count))),
+            ),
             spill.sample_count,
+            (
+                ("weight", None),
+                ("weightType", "samples"),
+                ("length", spill.sample_count),
+            ),
         )
-        file.write(',"weight":null,"weightType":"samples","length":')
-        file.write(repr(spill.sample_count))
-        file.write("}")
 
     def _stream_markers(self, file, spill):
-        file.write('{"data":')
-        _stream_array(
-            file, spill.markers_data.iter_tokens(), spill.marker_count
-        )
-        file.write(',"name":')
-        _stream_array(
-            file, spill.markers_name.iter_tokens(), spill.marker_count
-        )
-        file.write(',"startTime":')
-        _stream_array(
-            file, spill.markers_start_time.iter_tokens(), spill.marker_count
-        )
-        file.write(',"endTime":')
-        _stream_array(
-            file, spill.markers_end_time.iter_tokens(), spill.marker_count
-        )
-        file.write(',"phase":')
-        _stream_array(
-            file, spill.markers_phase.iter_tokens(), spill.marker_count
-        )
-        file.write(',"category":')
-        _stream_array(
-            file, spill.markers_category.iter_tokens(), spill.marker_count
+        _stream_column_table(
+            file,
+            (
+                ("data", spill.markers_data.iter_tokens()),
+                ("name", spill.markers_name.iter_tokens()),
+                ("startTime", spill.markers_start_time.iter_tokens()),
+                ("endTime", spill.markers_end_time.iter_tokens()),
+                ("phase", spill.markers_phase.iter_tokens()),
+                ("category", spill.markers_category.iter_tokens()),
+            ),
+            spill.marker_count,
+            (("length", spill.marker_count),),
         )
-        file.write(',"length":')
-        file.write(repr(spill.marker_count))
-        file.write("}")
 
 
-def _stream_array(file, token_iter, expected_count):
+def _write_json(file, value):
+    for chunk in _JSON_ENCODER.iterencode(value):
+        file.write(chunk)
+
+
+def _write_member_name(file, name, first):
+    if not first:
+        file.write(",")
+    _write_json(file, name)
+    file.write(":")
+    return False
+
+
+def _write_json_member(file, name, value, first):
+    first = _write_member_name(file, name, first)
+    _write_json(file, value)
+    return first
+
+
+def _stream_column_table(file, columns, expected_count, trailing_members=()):
+    file.write("{")
+    first = True
+    for name, token_iter in columns:
+        first = _write_member_name(file, name, first)
+        _stream_array(file, token_iter, expected_count, name)
+    for name, value in trailing_members:
+        first = _write_json_member(file, name, value, first)
+    file.write("}")
+
+
+def _stream_array(file, token_iter, expected_count, label="array"):
     file.write("[")
     count = 0
     for token in token_iter:
@@ -954,6 +975,6 @@ def _stream_array(file, token_iter, expected_count):
         count += 1
     if count != expected_count:
         raise RuntimeError(
-            f"streamed {count} array items, expected {expected_count}"
+            f"streamed {count} {label} items, expected {expected_count}"
         )
     file.write("]")

From 2846e2521b5dbaf7296c9f84160ec2f03da1ecfb Mon Sep 17 00:00:00 2001
From: Pablo Galindo Salgado <pablogsal@gmail.com>
Date: Sat, 6 Jun 2026 02:53:19 +0100
Subject: [PATCH 9/9] fixup! test

---
 .../test_sampling_profiler/test_collectors.py | 206 ++++++++++++++----
 1 file changed, 168 insertions(+), 38 deletions(-)

diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py
index 9fe0d8f1bb66ff1..1ab31af67fec522 100644
--- a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py
+++ b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py
@@ -11,6 +11,7 @@
 
 try:
     import _remote_debugging  # noqa: F401
+    from profiling.sampling import gecko_collector
     from profiling.sampling.pstats_collector import PstatsCollector
     from profiling.sampling.stack_collector import (
         CollapsedStackCollector,
@@ -59,6 +60,42 @@ def find_child_by_name(children, strings, substr):
     return None
 
 
+def export_gecko_profile(testcase, collector):
+    gecko_out = tempfile.NamedTemporaryFile(suffix=".json", delete=False)
+    testcase.addCleanup(close_and_unlink, gecko_out)
+    # We cannot overwrite an open file on Windows.
+    gecko_out.close()
+
+    with captured_stdout(), captured_stderr():
+        collector.export(gecko_out.name)
+
+    testcase.assertGreater(os.path.getsize(gecko_out.name), 0)
+    with open(gecko_out.name, encoding="utf-8") as file:
+        return json.load(file)
+
+
+def assert_gecko_column_lengths(testcase, table, columns):
+    expected = table["length"]
+    for column in columns:
+        testcase.assertEqual(
+            len(table[column]), expected,
+            f"{column!r} has wrong length",
+        )
+
+
+def gecko_marker_names(profile, markers):
+    string_array = profile["shared"]["stringArray"]
+    return [string_array[idx] for idx in markers["name"]]
+
+
+def gecko_opcode_marker_data(profile):
+    markers = profile["threads"][0]["markers"]
+    return [
+        data for data in markers["data"]
+        if data.get("type") == "Opcode"
+    ]
+
+
 class TestSampleProfilerComponents(unittest.TestCase):
     """Unit tests for individual profiler components."""
 
@@ -583,9 +620,10 @@ def test_gecko_collector_basic(self):
 
         # Verify samples
         samples = thread_data["samples"]
-        self.assertEqual(len(samples["stack"]), 1)
-        self.assertEqual(len(samples["time"]), 1)
         self.assertEqual(samples["length"], 1)
+        assert_gecko_column_lengths(
+            self, samples, ("stack", "time", "eventDelay")
+        )
 
         # Verify function table structure and content
         func_table = thread_data["funcTable"]
@@ -622,11 +660,6 @@ def test_gecko_collector_basic(self):
     @unittest.skipIf(is_emscripten, "threads not available")
     def test_gecko_collector_export(self):
         """Test Gecko profile export functionality."""
-        gecko_out = tempfile.NamedTemporaryFile(suffix=".json", delete=False)
-        self.addCleanup(close_and_unlink, gecko_out)
-        # We cannot overwrite an open file on Windows.
-        gecko_out.close()
-
         collector = GeckoCollector(1000)
 
         test_frames1 = [
@@ -659,17 +692,7 @@ def test_gecko_collector_export(self):
         collector.collect(test_frames2)
         collector.collect(test_frames3)
 
-        # Export gecko profile
-        with captured_stdout(), captured_stderr():
-            collector.export(gecko_out.name)
-
-        # Verify file was created and contains valid data
-        self.assertTrue(os.path.exists(gecko_out.name))
-        self.assertGreater(os.path.getsize(gecko_out.name), 0)
-
-        # Check file contains valid JSON
-        with open(gecko_out.name, "r") as f:
-            profile_data = json.load(f)
+        profile_data = export_gecko_profile(self, collector)
 
         # Should be valid Gecko profile format
         self.assertIn("meta", profile_data)
@@ -690,6 +713,100 @@ def test_gecko_collector_export(self):
         self.assertIn("func2", string_array)
         self.assertIn("other_func", string_array)
 
+        thread_data = profile_data["threads"][0]
+        assert_gecko_column_lengths(
+            self, thread_data["samples"], ("stack", "time", "eventDelay")
+        )
+
+    @unittest.skipIf(is_emscripten, "threads not available")
+    def test_gecko_collector_export_after_spill_flush(self):
+        """Test Gecko profile export after spill buffers flush to disk."""
+        old_buffer_bytes = gecko_collector.DEFAULT_SPILL_BUFFER_BYTES
+        gecko_collector.DEFAULT_SPILL_BUFFER_BYTES = 1
+        self.addCleanup(
+            setattr, gecko_collector, "DEFAULT_SPILL_BUFFER_BYTES",
+            old_buffer_bytes
+        )
+
+        collector = GeckoCollector(1000)
+        test_frames = [
+            MockInterpreterInfo(
+                0,
+                [
+                    MockThreadInfo(
+                        1,
+                        [MockFrameInfo("file.py", 10, "func")],
+                        status=THREAD_STATUS_HAS_GIL,
+                    )
+                ],
+            )
+        ]
+        collector.collect(test_frames, timestamps_us=[1000, 2000, 3000])
+
+        profile_data = export_gecko_profile(self, collector)
+        samples = profile_data["threads"][0]["samples"]
+        self.assertEqual(samples["length"], 3)
+        assert_gecko_column_lengths(
+            self, samples, ("stack", "time", "eventDelay")
+        )
+
+    @unittest.skipIf(is_emscripten, "threads not available")
+    def test_gecko_collector_rejects_collect_after_export(self):
+        collector = GeckoCollector(1000)
+        test_frames = [
+            MockInterpreterInfo(
+                0,
+                [
+                    MockThreadInfo(
+                        1,
+                        [MockFrameInfo("file.py", 10, "func")],
+                        status=THREAD_STATUS_HAS_GIL,
+                    )
+                ],
+            )
+        ]
+        collector.collect(test_frames)
+        export_gecko_profile(self, collector)
+
+        with self.assertRaisesRegex(RuntimeError, "after export"):
+            collector.collect(test_frames)
+
+    @unittest.skipIf(is_emscripten, "threads not available")
+    def test_gecko_collector_export_failure_keeps_existing_file(self):
+        collector = GeckoCollector(1000)
+        test_frames = [
+            MockInterpreterInfo(
+                0,
+                [
+                    MockThreadInfo(
+                        1,
+                        [MockFrameInfo("file.py", 10, "func")],
+                        status=THREAD_STATUS_HAS_GIL,
+                    )
+                ],
+            )
+        ]
+        collector.collect(test_frames)
+
+        with tempfile.TemporaryDirectory() as temp_dir:
+            filename = os.path.join(temp_dir, "profile.json")
+            with open(filename, "w", encoding="utf-8") as file:
+                file.write("existing")
+
+            before = set(os.listdir(temp_dir))
+
+            def fail(file):
+                raise OSError("boom")
+
+            collector._stream_profile = fail
+            with captured_stdout(), captured_stderr():
+                with self.assertRaisesRegex(OSError, "boom"):
+                    collector.export(filename)
+
+            with open(filename, encoding="utf-8") as file:
+                self.assertEqual(file.read(), "existing")
+            self.assertEqual(set(os.listdir(temp_dir)), before)
+
     def test_gecko_collector_markers(self):
         """Test Gecko profile markers for GIL and CPU state tracking."""
         collector = GeckoCollector(1000)
@@ -773,21 +890,16 @@ def test_gecko_collector_markers(self):
         self.assertIn("markers", thread_data)
         markers = thread_data["markers"]
 
-        # Should have marker arrays
-        self.assertIn("name", markers)
-        self.assertIn("startTime", markers)
-        self.assertIn("endTime", markers)
-        self.assertIn("category", markers)
         self.assertGreater(
             markers["length"], 0, "Should have generated markers"
         )
-
-        # Get marker names from string table
-        string_array = profile_data["shared"]["stringArray"]
-        marker_names = [string_array[idx] for idx in markers["name"]]
+        assert_gecko_column_lengths(
+            self, markers,
+            ("data", "name", "startTime", "endTime", "phase", "category"),
+        )
 
         # Verify we have different marker types
-        marker_name_set = set(marker_names)
+        marker_name_set = set(gecko_marker_names(profile_data, markers))
 
         # Should have "Has GIL" markers (when thread had GIL)
         self.assertIn(
@@ -2661,7 +2773,7 @@ def test_gecko_collector_opcodes_enabled(self):
     def test_gecko_opcode_state_tracking(self):
         """Test that GeckoCollector tracks opcode state changes."""
         collector = GeckoCollector(sample_interval_usec=1000, opcodes=True)
-        self.addCleanup(lambda: collector.spill_dir.cleanup())
+        self.addCleanup(collector._cleanup_spills)
 
         # First sample with opcode 90 (RAISE_VARARGS)
         frame1 = MockFrameInfo("test.py", 10, "func", opcode=90)
@@ -2683,7 +2795,6 @@ def test_gecko_opcode_state_tracking(self):
     def test_gecko_opcode_state_change_emits_marker(self):
         """Test that opcode state change emits an interval marker."""
         collector = GeckoCollector(sample_interval_usec=1000, opcodes=True)
-        self.addCleanup(lambda: collector.spill_dir.cleanup())
 
         # First sample: opcode 90
         frame1 = MockFrameInfo("test.py", 10, "func", opcode=90)
@@ -2706,12 +2817,32 @@ def test_gecko_opcode_state_change_emits_marker(self):
         collector.collect(frames2)
 
         # Should have emitted a marker for the first opcode
-        self.assertGreater(collector.thread_spills[1].marker_count, 0)
+        profile = collector._build_profile()
+        markers = profile["threads"][0]["markers"]
+        assert_gecko_column_lengths(
+            self, markers,
+            ("data", "name", "startTime", "endTime", "phase", "category"),
+        )
+        opcode_markers = gecko_opcode_marker_data(profile)
+        self.assertIn(
+            {
+                "opcode": 90,
+                "line": 10,
+                "function": "func",
+            },
+            [
+                {
+                    "opcode": marker["opcode"],
+                    "line": marker["line"],
+                    "function": marker["function"],
+                }
+                for marker in opcode_markers
+            ],
+        )
 
     def test_gecko_opcode_markers_not_emitted_when_disabled(self):
         """Test that no opcode markers when opcodes=False."""
         collector = GeckoCollector(sample_interval_usec=1000, opcodes=False)
-        self.addCleanup(lambda: collector.spill_dir.cleanup())
 
         frame1 = MockFrameInfo("test.py", 10, "func", opcode=90)
         frames1 = [
@@ -2731,13 +2862,13 @@ def test_gecko_opcode_markers_not_emitted_when_disabled(self):
         ]
         collector.collect(frames2)
 
-        # opcode_state should not be tracked
-        self.assertEqual(len(collector.opcode_state), 0)
+        profile = collector._build_profile()
+        self.assertEqual(gecko_opcode_marker_data(profile), [])
+        self.assertEqual(profile["meta"]["markerSchema"], [])
 
     def test_gecko_opcode_with_none_opcode(self):
         """Test that None opcode doesn't cause issues."""
         collector = GeckoCollector(sample_interval_usec=1000, opcodes=True)
-        self.addCleanup(lambda: collector.spill_dir.cleanup())
 
         # Frame with no opcode (None)
         frame = MockFrameInfo("test.py", 10, "func", opcode=None)
@@ -2749,9 +2880,8 @@ def test_gecko_opcode_with_none_opcode(self):
         ]
         collector.collect(frames)
 
-        # Should track the state but opcode is None
-        self.assertIn(1, collector.opcode_state)
-        self.assertIsNone(collector.opcode_state[1][0])
+        profile = collector._build_profile()
+        self.assertEqual(gecko_opcode_marker_data(profile), [])
 
 
 class TestCollectorFrameFormat(unittest.TestCase):