From 578a145d31311c6c1d46431743c5b5483bcc5d5d Mon Sep 17 00:00:00 2001
From: WyattBlue
Date: Wed, 16 Oct 2024 05:41:01 -0400
Subject: [PATCH] Use PyAV only for rendering video

---
 auto_editor/ffwrapper.py    |  37 -----
 auto_editor/output.py       |  23 ++-
 auto_editor/render/video.py | 308 ++++++++++++++++--------------------
 3 files changed, 161 insertions(+), 207 deletions(-)

diff --git a/auto_editor/ffwrapper.py b/auto_editor/ffwrapper.py
index f57eaabf9..0d06a3baa 100644
--- a/auto_editor/ffwrapper.py
+++ b/auto_editor/ffwrapper.py
@@ -1,11 +1,9 @@
 from __future__ import annotations
 
-import os.path
 import sys
 from dataclasses import dataclass
 from fractions import Fraction
 from pathlib import Path
-from re import search
 from shutil import which
 from subprocess import PIPE, Popen, run
 from typing import Any
@@ -52,41 +50,6 @@ def run(self, cmd: list[str]) -> None:
         sys.stderr.write(f"{' '.join(cmd)}\n\n")
         run(cmd)
 
-    def run_check_errors(
-        self, cmd: list[str], show_out: bool = False, path: str | None = None
-    ) -> None:
-        process = self.Popen(cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE)
-        _, stderr = process.communicate()
-
-        if process.stdin is not None:
-            process.stdin.close()
-        output = stderr.decode("utf-8", "replace")
-
-        error_list = (
-            r"Unknown encoder '.*'",
-            r"-q:v qscale not available for encoder\. Use -b:v bitrate instead\.",
-            r"Specified sample rate .* is not supported",
-            r'Unable to parse option value ".*"',
-            r"Error setting option .* to value .*\.",
-            r"Undefined constant or missing '.*' in '.*'",
-            r"DLL .* failed to open",
-            r"Incompatible pixel format '.*' for codec '[A-Za-z0-9_]*'",
-            r"Unrecognized option '.*'",
-            r"Permission denied",
-        )
-
-        if self.debug:
-            print(f"stderr: {output}")
-
-        for item in error_list:
-            if check := search(item, output):
-                self.log.error(check.group())
-
-        if path is not None and not os.path.isfile(path):
-            self.log.error(f"The file {path} was not created.")
-        if show_out and not self.debug:
-            print(f"stderr: {output}")
-
     def Popen(
         self, cmd: list[str], stdin: Any = None, stdout: Any = PIPE, stderr: Any = None
     ) -> Popen:
diff --git a/auto_editor/output.py b/auto_editor/output.py
index 9f834adc7..e82a8abda 100644
--- a/auto_editor/output.py
+++ b/auto_editor/output.py
@@ -3,6 +3,8 @@
 import os.path
 from dataclasses import dataclass, field
 from fractions import Fraction
+from re import search
+from subprocess import PIPE
 
 import av
 from av.audio.resampler import AudioResampler
@@ -222,4 +224,23 @@ def mux_quality_media(
         cmd.extend(["-map", "0:d?"])
 
     cmd.append(output_path)
-    ffmpeg.run_check_errors(cmd, path=output_path)
+
+    process = ffmpeg.Popen(cmd, stdout=PIPE, stderr=PIPE)
+    stderr = process.communicate()[1].decode("utf-8", "replace")
+    error_list = (
+        r"Unknown encoder '.*'",
+        r"-q:v qscale not available for encoder\. Use -b:v bitrate instead\.",
+        r"Specified sample rate .* is not supported",
+        r'Unable to parse option value ".*"',
+        r"Error setting option .* to value .*\.",
+        r"DLL .* failed to open",
+        r"Incompatible pixel format '.*' for codec '[A-Za-z0-9_]*'",
+        r"Unrecognized option '.*'",
+        r"Permission denied",
+    )
+    for item in error_list:
+        if check := search(item, stderr):
+            log.error(check.group())
+
+    if not os.path.isfile(output_path):
+        log.error(f"The file {output_path} was not created.")
diff --git a/auto_editor/render/video.py b/auto_editor/render/video.py
index 7cd62bc35..c21d15ff0 100644
--- a/auto_editor/render/video.py
+++ b/auto_editor/render/video.py
@@ -2,14 +2,11 @@
 
 import os.path
 from dataclasses import dataclass
-from subprocess import DEVNULL, PIPE
-from sys import platform
 from typing import TYPE_CHECKING
 
 import av
 import numpy as np
 
-from auto_editor.output import video_quality
 from auto_editor.timeline import TlImage, TlRect, TlVideo
 from auto_editor.utils.encoder import encoders
 from auto_editor.utils.types import color
@@ -99,6 +96,7 @@ def render_av(
     tous: dict[FileInfo, int] = {}
 
     target_pix_fmt = "yuv420p"  # Reasonable default
+    target_fps = tl.tb  # Always constant
     img_cache = make_image_cache(tl)
 
     temp = log.temp
@@ -133,15 +131,29 @@ def render_av(
     log.debug(f"Tous: {tous}")
     log.debug(f"Clips: {tl.v}")
 
-    target_pix_fmt = target_pix_fmt if target_pix_fmt in allowed_pix_fmt else "yuv420p"
-    log.debug(f"Target pix_fmt: {target_pix_fmt}")
-
     apply_video_later = True
     if args.video_codec in encoders:
         apply_video_later = set(encoders[args.video_codec]).isdisjoint(allowed_pix_fmt)
 
     log.debug(f"apply video quality settings now: {not apply_video_later}")
 
+    spedup = os.path.join(temp, "spedup0.mkv")
+    output = av.open(spedup, "w")
+    if apply_video_later:
+        output_stream = output.add_stream("mpeg4", rate=target_fps)
+        target_pix_fmt = "yuv420p"
+    else:
+        _temp = output.add_stream(
+            args.video_codec, rate=target_fps, options={"mov_flags": "faststart"}
+        )
+        if not isinstance(_temp, av.VideoStream):
+            log.error(f"Not a known video codec: {args.video_codec}")
+        output_stream = _temp
+        target_pix_fmt = (
+            target_pix_fmt if target_pix_fmt in allowed_pix_fmt else "yuv420p"
+        )
+        # TODO: apply `-b:v`, `qscale:v`
+
     if args.scale == 1.0:
         target_width, target_height = tl.res
         scale_graph = None
@@ -157,44 +169,14 @@ def render_av(
             scale_graph.add("buffersink"),
         )
 
-    spedup = os.path.join(temp, "spedup0.mp4")
-
-    cmd = [
-        "-hide_banner",
-        "-y",
-        "-f",
-        "rawvideo",
-        "-c:v",
-        "rawvideo",
-        "-pix_fmt",
-        target_pix_fmt,
-        "-s",
-        f"{target_width}*{target_height}",
-        "-framerate",
-        f"{tl.tb}",
-        "-i",
-        "-",
-        "-pix_fmt",
-        target_pix_fmt,
-    ]
-
-    if platform == "darwin":
-        # Fix videotoolbox issue with legacy macs
-        cmd += ["-allow_sw", "1"]
+    output_stream.width = target_width
+    output_stream.height = target_height
+    output_stream.pix_fmt = target_pix_fmt
 
-    if apply_video_later:
-        cmd += ["-c:v", "mpeg4", "-qscale:v", "1"]
-    else:
-        cmd += video_quality(args)
-
-    # Setting SAR requires re-encoding so we do it here.
     if src is not None and src.videos and (sar := src.videos[0].sar) is not None:
-        cmd.extend(["-vf", f"setsar={sar}"])
-
-    cmd.append(spedup)
+        output_stream.sample_aspect_ratio = sar
 
-    process2 = ffmpeg.Popen(cmd, stdin=PIPE, stdout=DEVNULL, stderr=DEVNULL)
-    assert process2.stdin is not None
+    from_ndarray = av.VideoFrame.from_ndarray
 
     # First few frames can have an abnormal keyframe count, so never seek there.
     seek = 10
@@ -206,142 +188,130 @@ def render_av(
     bg = color(args.background)
     null_frame = make_solid(target_width, target_height, target_pix_fmt, bg)
     frame_index = -1
-    try:
-        for index in range(tl.end):
-            obj_list: list[VideoFrame | TlRect | TlImage] = []
-            for layer in tl.v:
-                for lobj in layer:
-                    if isinstance(lobj, TlVideo):
-                        if index >= lobj.start and index < (lobj.start + lobj.dur):
-                            _i = round((lobj.offset + index - lobj.start) * lobj.speed)
-                            obj_list.append(VideoFrame(_i, lobj.src))
-                    elif index >= lobj.start and index < lobj.start + lobj.dur:
-                        obj_list.append(lobj)
-
-            frame = null_frame
-            for obj in obj_list:
-                if isinstance(obj, VideoFrame):
-                    my_stream = cns[obj.src].streams.video[0]
-                    if frame_index > obj.index:
-                        log.debug(f"Seek: {frame_index} -> 0")
-                        cns[obj.src].seek(0)
-                        try:
-                            frame = next(decoders[obj.src])
-                            frame_index = round(frame.time * tl.tb)
-                        except StopIteration:
-                            pass
-
-                    while frame_index < obj.index:
-                        # Check if skipping ahead is worth it.
-                        if (
-                            obj.index - frame_index > seek_cost[obj.src]
-                            and frame_index > seek
-                        ):
-                            seek = frame_index + (seek_cost[obj.src] // 2)
-                            seek_frame = frame_index
-                            log.debug(f"Seek: {frame_index} -> {obj.index}")
-                            cns[obj.src].seek(
-                                obj.index * tous[obj.src],
-                                stream=my_stream,
-                            )
-
-                        try:
-                            frame = next(decoders[obj.src])
-                            frame_index = round(frame.time * tl.tb)
-                        except StopIteration:
-                            log.debug(f"No source frame at {index=}. Using null frame")
-                            frame = null_frame
-                            break
-
-                    if seek_frame is not None:
-                        log.debug(
-                            f"Skipped {frame_index - seek_frame} frame indexes"
-                        )
-                        frames_saved += frame_index - seek_frame
-                        seek_frame = None
-                    if frame.key_frame:
-                        log.debug(f"Keyframe {frame_index} {frame.pts}")
-
-                    if (frame.width, frame.height) != tl.res:
-                        width, height = tl.res
-                        graph = av.filter.Graph()
-                        graph.link_nodes(
-                            graph.add_buffer(template=my_stream),
-                            graph.add(
-                                "scale",
-                                f"{width}:{height}:force_original_aspect_ratio=decrease:eval=frame",
-                            ),
-                            graph.add("pad", f"{width}:{height}:-1:-1:color={bg}"),
-                            graph.add("buffersink"),
-                        ).vpush(frame)
-                        frame = graph.vpull()
-                elif isinstance(obj, TlRect):
+    for index in range(tl.end):
+        obj_list: list[VideoFrame | TlRect | TlImage] = []
+        for layer in tl.v:
+            for lobj in layer:
+                if isinstance(lobj, TlVideo):
+                    if index >= lobj.start and index < (lobj.start + lobj.dur):
+                        _i = round((lobj.offset + index - lobj.start) * lobj.speed)
+                        obj_list.append(VideoFrame(_i, lobj.src))
+                elif index >= lobj.start and index < lobj.start + lobj.dur:
+                    obj_list.append(lobj)
+
+        frame = null_frame
+        for obj in obj_list:
+            if isinstance(obj, VideoFrame):
+                my_stream = cns[obj.src].streams.video[0]
+                if frame_index > obj.index:
+                    log.debug(f"Seek: {frame_index} -> 0")
+                    cns[obj.src].seek(0)
+                    try:
+                        frame = next(decoders[obj.src])
+                        frame_index = round(frame.time * tl.tb)
+                    except StopIteration:
+                        pass
+
+                while frame_index < obj.index:
+                    # Check if skipping ahead is worth it.
+                    if (
+                        obj.index - frame_index > seek_cost[obj.src]
+                        and frame_index > seek
+                    ):
+                        seek = frame_index + (seek_cost[obj.src] // 2)
+                        seek_frame = frame_index
+                        log.debug(f"Seek: {frame_index} -> {obj.index}")
+                        cns[obj.src].seek(obj.index * tous[obj.src], stream=my_stream)
+
+                    try:
+                        frame = next(decoders[obj.src])
+                        frame_index = round(frame.time * tl.tb)
+                    except StopIteration:
+                        log.debug(f"No source frame at {index=}. Using null frame")
Using null frame") + frame = null_frame + break + + if seek_frame is not None: + log.debug(f"Skipped {frame_index - seek_frame} frame indexes") + frames_saved += frame_index - seek_frame + seek_frame = None + if frame.key_frame: + log.debug(f"Keyframe {frame_index} {frame.pts}") + + if (frame.width, frame.height) != tl.res: + width, height = tl.res graph = av.filter.Graph() - x, y = obj.x, obj.y graph.link_nodes( graph.add_buffer(template=my_stream), graph.add( - "drawbox", - f"x={x}:y={y}:w={obj.width}:h={obj.height}:color={obj.fill}:t=fill", + "scale", + f"{width}:{height}:force_original_aspect_ratio=decrease:eval=frame", ), + graph.add("pad", f"{width}:{height}:-1:-1:color={bg}"), graph.add("buffersink"), ).vpush(frame) frame = graph.vpull() - elif isinstance(obj, TlImage): - img = img_cache[(obj.src, obj.width)] - array = frame.to_ndarray(format="rgb24") - - overlay_h, overlay_w, _ = img.shape - x_pos, y_pos = obj.x, obj.y - - x_start = max(x_pos, 0) - y_start = max(y_pos, 0) - x_end = min(x_pos + overlay_w, frame.width) - y_end = min(y_pos + overlay_h, frame.height) - - # Clip the overlay image to fit into the frame - overlay_x_start = max(-x_pos, 0) - overlay_y_start = max(-y_pos, 0) - overlay_x_end = overlay_w - max( - (x_pos + overlay_w) - frame.width, 0 - ) - overlay_y_end = overlay_h - max( - (y_pos + overlay_h) - frame.height, 0 - ) - clipped_overlay = img[ - overlay_y_start:overlay_y_end, overlay_x_start:overlay_x_end - ] - - # Create a region of interest (ROI) on the video frame - roi = array[y_start:y_end, x_start:x_end] - - # Blend the overlay image with the ROI based on the opacity - roi = (1 - obj.opacity) * roi + obj.opacity * clipped_overlay - array[y_start:y_end, x_start:x_end] = roi - array = np.clip(array, 0, 255).astype(np.uint8) - - frame = av.VideoFrame.from_ndarray(array, format="rgb24") - - if scale_graph is not None and frame.width != target_width: - scale_graph.vpush(frame) - frame = scale_graph.vpull() - - if frame.format.name != target_pix_fmt: - frame = frame.reformat(format=target_pix_fmt) - bar.tick(index) - elif index % 3 == 0: - bar.tick(index) - - process2.stdin.write(frame.to_ndarray().tobytes()) - - bar.end() - process2.stdin.close() - process2.wait() - except (OSError, BrokenPipeError): - bar.end() - ffmpeg.run_check_errors(cmd, True) - log.error("FFmpeg Error!") - + elif isinstance(obj, TlRect): + graph = av.filter.Graph() + x, y = obj.x, obj.y + graph.link_nodes( + graph.add_buffer(template=my_stream), + graph.add( + "drawbox", + f"x={x}:y={y}:w={obj.width}:h={obj.height}:color={obj.fill}:t=fill", + ), + graph.add("buffersink"), + ).vpush(frame) + frame = graph.vpull() + elif isinstance(obj, TlImage): + img = img_cache[(obj.src, obj.width)] + array = frame.to_ndarray(format="rgb24") + + overlay_h, overlay_w, _ = img.shape + x_pos, y_pos = obj.x, obj.y + + x_start = max(x_pos, 0) + y_start = max(y_pos, 0) + x_end = min(x_pos + overlay_w, frame.width) + y_end = min(y_pos + overlay_h, frame.height) + + # Clip the overlay image to fit into the frame + overlay_x_start = max(-x_pos, 0) + overlay_y_start = max(-y_pos, 0) + overlay_x_end = overlay_w - max((x_pos + overlay_w) - frame.width, 0) + overlay_y_end = overlay_h - max((y_pos + overlay_h) - frame.height, 0) + clipped_overlay = img[ + overlay_y_start:overlay_y_end, overlay_x_start:overlay_x_end + ] + + # Create a region of interest (ROI) on the video frame + roi = array[y_start:y_end, x_start:x_end] + + # Blend the overlay image with the ROI based on the opacity + roi = (1 - obj.opacity) * roi + 
+                array[y_start:y_end, x_start:x_end] = roi
+                array = np.clip(array, 0, 255).astype(np.uint8)
+
+                frame = from_ndarray(array, format="rgb24")
+
+        if scale_graph is not None and frame.width != target_width:
+            scale_graph.vpush(frame)
+            frame = scale_graph.vpull()
+
+        if frame.format.name != target_pix_fmt:
+            frame = frame.reformat(format=target_pix_fmt)
+            bar.tick(index)
+        elif index % 3 == 0:
+            bar.tick(index)
+
+        new_frame = from_ndarray(frame.to_ndarray(), format=frame.format.name)
+        output.mux(output_stream.encode(new_frame))
+
+    bar.end()
+
+    output.mux(output_stream.encode(None))
+    output.close()
 
     log.debug(f"Total frames saved seeking: {frames_saved}")
     return spedup, apply_video_later
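
Reviewer note, not part of the patch: the core change in render/video.py replaces a rawvideo pipe into an ffmpeg subprocess with PyAV's encode/mux API. Below is a minimal standalone sketch of that pattern; the output name, 640x360 size, 30 fps rate, and mpeg4 codec are illustrative assumptions, not values taken from auto-editor.

# Minimal sketch of the PyAV encode/mux pattern the patch adopts.
import av
import numpy as np

container = av.open("example.mkv", "w")  # hypothetical output path
stream = container.add_stream("mpeg4", rate=30)
stream.width = 640
stream.height = 360
stream.pix_fmt = "yuv420p"

for i in range(90):  # three seconds of synthetic frames
    image = np.zeros((360, 640, 3), dtype=np.uint8)
    image[:, : (i * 640) // 90] = (0, 255, 0)  # green wipe, left to right
    frame = av.VideoFrame.from_ndarray(image, format="rgb24")
    container.mux(stream.encode(frame))  # encode() may buffer and return 0+ packets

container.mux(stream.encode(None))  # flush delayed packets, as the patch does
container.close()

PyAV reformats the rgb24 frames to the stream's yuv420p on encode, which is why the patch can feed the encoder frames straight from numpy arrays.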
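The TlImage branch clips the overlay rectangle to the frame before blending, so a negative or out-of-range x/y never indexes out of bounds. Here is a self-contained sketch of the same ROI arithmetic; blend() is a hypothetical helper, not a function in the patch.

# Sketch of the clipped alpha blend used in the TlImage branch.
# Assumes the overlay rectangle intersects the frame, as the patch's code does.
import numpy as np

def blend(frame: np.ndarray, overlay: np.ndarray, x: int, y: int, opacity: float) -> None:
    frame_h, frame_w, _ = frame.shape
    overlay_h, overlay_w, _ = overlay.shape

    # Destination rectangle, clipped to the frame bounds.
    x_start, y_start = max(x, 0), max(y, 0)
    x_end = min(x + overlay_w, frame_w)
    y_end = min(y + overlay_h, frame_h)

    # Matching source rectangle inside the overlay.
    ox_start, oy_start = max(-x, 0), max(-y, 0)
    ox_end = overlay_w - max((x + overlay_w) - frame_w, 0)
    oy_end = overlay_h - max((y + overlay_h) - frame_h, 0)

    roi = frame[y_start:y_end, x_start:x_end]
    clipped = overlay[oy_start:oy_end, ox_start:ox_end]
    mixed = (1 - opacity) * roi + opacity * clipped
    frame[y_start:y_end, x_start:x_end] = np.clip(mixed, 0, 255).astype(np.uint8)

canvas = np.zeros((360, 640, 3), dtype=np.uint8)
logo = np.full((64, 64, 3), 255, dtype=np.uint8)
blend(canvas, logo, x=-16, y=300, opacity=0.5)  # partially off-screen on the left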
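The error scan moved into output.py keeps the old run_check_errors behavior: run the command, collect stderr, and surface the first recognizable fatal message. A compressed sketch follows; the ffmpeg invocation and the two patterns are a reduced, illustrative subset, and the RuntimeError stands in for the patch's log.error() call.

# Sketch of the stderr scan now inlined in mux_quality_media().
from re import search
from subprocess import PIPE, Popen

process = Popen(["ffmpeg", "-i", "input.mp4", "output.mp4"], stdout=PIPE, stderr=PIPE)
stderr = process.communicate()[1].decode("utf-8", "replace")

for pattern in (r"Unknown encoder '.*'", r"Permission denied"):
    if match := search(pattern, stderr):
        raise RuntimeError(match.group())  # the patch calls log.error() here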