From a9ac178eb1c8800bb7bc53cc04ab2027d7287fba Mon Sep 17 00:00:00 2001
From: Jody Bruchon <jody@jodybruchon.com>
Date: Thu, 19 Dec 2024 11:36:00 -0500
Subject: [PATCH] Use 64 KiB buffered writes for performance and less
 fragmentation

I use yt-dlp on Windows writing to a Linux system via SMB over a
10GbE connection and downloading via 400 Mbps cable internet. I
have observed that downloads often seem to start very fast (40+
MiB/sec) but then throttle down to 8-20 MiB/sec. I also observed
far more disk thrashing than I would expect from such a large array
when a small amount of data is supposedly being written sequentially.

The problem is two-fold. Downloaded fragments are stored using a
very short-lived *-FragX file, then immediately appended to the
stream upon fragment completion, and deleted. Both operations use
small write buffers. When the OS write buffers start to flush, these
two sets of writes, each made up of a large number of small writes,
compete with each other to complete in different areas of the
volume.

By default, Python sizes its write buffer to the underlying device's
"block size", falling back to io.DEFAULT_BUFFER_SIZE. In practical
terms, this means a write buffer of 4096 or 8192 bytes. This
commit increases most write buffers to 65536 (64 KiB) using the
open() buffering=X option, significantly speeding up writes of
larger chunks of data and reducing potential fragmentation in low
disk space conditions. With these changes, I consistently see fast
downloads and the array thrashing is noticeably lessened.
---
 yt_dlp/YoutubeDL.py        | 6 +++---
 yt_dlp/extractor/common.py | 2 +-
 yt_dlp/utils/_utils.py     | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py
index 65b72e026..de7ee2160 100644
--- a/yt_dlp/YoutubeDL.py
+++ b/yt_dlp/YoutubeDL.py
@@ -3298,7 +3298,7 @@ class YoutubeDL:
             else:
                 try:
                     self.to_screen('[info] Writing video annotations to: ' + annofn)
-                    with open(annofn, 'w', encoding='utf-8') as annofile:
+                    with open(annofn, 'w', encoding='utf-8', buffering=65536) as annofile:
                         annofile.write(info_dict['annotations'])
                 except (KeyError, TypeError):
                     self.report_warning('There are no annotations to write.')
@@ -4336,7 +4336,7 @@ class YoutubeDL:
                 try:
                     # Use newline='' to prevent conversion of newline characters
                     # See https://github.com/ytdl-org/youtube-dl/issues/10268
-                    with open(sub_filename, 'w', encoding='utf-8', newline='') as subfile:
+                    with open(sub_filename, 'w', encoding='utf-8', newline='', buffering=65536) as subfile:
                         subfile.write(sub_info['data'])
                     sub_info['filepath'] = sub_filename
                     ret.append((sub_filename, sub_filename_final))
@@ -4399,7 +4399,7 @@ class YoutubeDL:
                 try:
                     uf = self.urlopen(Request(t['url'], headers=t.get('http_headers', {})))
                     self.to_screen(f'[info] Writing {thumb_display_id} to: {thumb_filename}')
-                    with open(thumb_filename, 'wb') as thumbf:
+                    with open(thumb_filename, 'wb', buffering=65536) as thumbf:
                         shutil.copyfileobj(uf, thumbf)
                     ret.append((thumb_filename, thumb_filename_final))
                     t['filepath'] = thumb_filename
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index 92ddad2b7..1b2c7d98d 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -1064,7 +1064,7 @@ class InfoExtractor:
                 data = self._create_request(url_or_request, data).data
             filename = self._request_dump_filename(urlh.url, video_id, data)
             self.to_screen(f'Saving request to {filename}')
-            with open(filename, 'wb') as outf:
+            with open(filename, 'wb', buffering=65536) as outf:
                 outf.write(webpage_bytes)
 
         content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py
index 699bf1e7f..e376e6887 100644
--- a/yt_dlp/utils/_utils.py
+++ b/yt_dlp/utils/_utils.py
@@ -187,7 +187,7 @@ def write_json_file(obj, fn):
 
     tf = tempfile.NamedTemporaryFile(
         prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
-        suffix='.tmp', delete=False, mode='w', encoding='utf-8')
+        suffix='.tmp', delete=False, mode='w', encoding='utf-8', buffering=65536)
 
     try:
         with tf:
@@ -603,7 +603,7 @@ def sanitize_open(filename, open_mode):
                     raise LockingUnsupportedError
                 stream = locked_file(filename, open_mode, block=False).__enter__()
             except OSError:
-                stream = open(filename, open_mode)
+                stream = open(filename, open_mode, buffering=65536)
             return stream, filename
         except OSError as err:
             if attempt or err.errno in (errno.EACCES,):