From 316cc3b5940152d55929a1e293b329cfcca307d8 Mon Sep 17 00:00:00 2001 From: occlusion Date: Sun, 2 Jul 2023 15:31:24 -0400 Subject: [PATCH 1/5] Added the "stitched_videos" parameter, which allows you to generate multiple videos consecutively, then blend them into one video post-generation. This is done by looping the render function in process_modelscope the amount of times set in the "Stitched Videos" slider in the UI. It will pass in the last frame of each previous video to use for inpainting frames. --- scripts/modelscope/process_modelscope.py | 171 ++++++++++++----------- scripts/t2v_helpers/args.py | 12 +- 2 files changed, 100 insertions(+), 83 deletions(-) diff --git a/scripts/modelscope/process_modelscope.py b/scripts/modelscope/process_modelscope.py index 8c238e1..3ef2c0b 100644 --- a/scripts/modelscope/process_modelscope.py +++ b/scripts/modelscope/process_modelscope.py @@ -136,88 +136,95 @@ def process_modelscope(args_dict): print('Working in txt2vid mode' if not args.do_vid2vid else 'Working in vid2vid mode') - # Start the batch count loop - pbar = tqdm(range(args.batch_count), leave=False) - if args.batch_count == 1: - pbar.disable = True - - vids_to_pack = [] - - state.job_count = args.batch_count - - for batch in pbar: - state.job_no = batch + 1 - if state.skipped: - state.skipped = False - - if state.interrupted: - break - - shared.state.job = f"Batch {batch + 1} out of {args.batch_count}" - # TODO: move to a separate function - if args.inpainting_frames > 0 and hasattr(args.inpainting_image, "name"): - keys = T2VAnimKeys(SimpleNamespace(**{'max_frames': args.frames, 'inpainting_weights': args.inpainting_weights}), args.seed, args.inpainting_frames) - images = [] - print("Received an image for inpainting", args.inpainting_image.name) - for i in range(args.frames): - image = Image.open(args.inpainting_image.name).convert("RGB") - image = image.resize((args.width, args.height), Image.ANTIALIAS) - array = np.array(image) - images += [array] - - images = np.stack(images) # f h w c - batches = 1 - n_images = np.tile(images[np.newaxis, ...], (batches, 1, 1, 1, 1)) # n f h w c - bcfhw = n_images.transpose(0, 4, 1, 2, 3) - # convert to 0-1 float - bcfhw = bcfhw.astype(np.float32) / 255 - bfchw = bcfhw.transpose(0, 2, 1, 3, 4) # b c f h w - - print(f"Converted the frames to tensor {bfchw.shape}") - - vd_out = torch.from_numpy(bcfhw).to("cuda") - - # should be -1,1, not 0,1 - vd_out = 2 * vd_out - 1 - - # latents should have shape num_sample, 4, max_frames, latent_h,latent_w - # but right now they have shape num_sample=1,4, 1 (only used 1 img), latent_h, latent_w - print("Computing latents") - image_latents = pipe.compute_latents(vd_out).numpy() - # padding_width = [(0, 0), (0, 0), (0, frames-inpainting_frames), (0, 0), (0, 0)] - # padded_latents = np.pad(image_latents, pad_width=padding_width, mode='constant', constant_values=0) - - latent_h = args.height // 8 - latent_w = args.width // 8 - latent_noise = np.random.normal(size=(1, 4, args.frames, latent_h, latent_w)) - mask = np.ones(shape=(1, 4, args.frames, latent_h, latent_w)) - - mask_weights = [keys.inpainting_weights_series[frame_idx] for frame_idx in range(args.frames)] - - for i in range(args.frames): - v = mask_weights[i] - mask[:, :, i, :, :] = v - - masked_latents = image_latents * (1 - mask) + latent_noise * mask - - latents = torch.tensor(masked_latents).to(device) - - mask = torch.tensor(mask).to(device) - - args.strength = 1 - - samples, _ = pipe.infer(args.prompt, args.n_prompt, args.steps, args.frames, args.seed + 
batch if args.seed != -1 else -1, args.cfg_scale, - args.width, args.height, args.eta, cpu_vae, device, latents, skip_steps=skip_steps, mask=mask) - - if batch > 0: - outdir_current = os.path.join(get_outdir(), f"{init_timestring}_{batch}") - print(f'text2video finished, saving frames to {outdir_current}') - - # just deleted the folder so we need to make it again - os.makedirs(outdir_current, exist_ok=True) - for i in range(len(samples)): - cv2.imwrite(outdir_current + os.path.sep + - f"{i:06}.png", samples[i]) + print(f'Generating {video_args.stitched_videos + 1} video(s) with {args.frames} frames each') + + for video_number in range(0, video_args.stitched_videos + 1): + # Start the batch count loop + pbar = tqdm(range(args.batch_count), leave=False) + if args.batch_count == 1: + pbar.disable = True + + vids_to_pack = [] + + state.job_count = args.batch_count + + for batch in pbar: + state.job_no = batch + 1 + if state.skipped: + state.skipped = False + + if state.interrupted: + break + + shared.state.job = f"Batch {batch + 1} out of {args.batch_count}" + # TODO: move to a separate function + if args.inpainting_frames > 0 and hasattr(args.inpainting_image, "name"): + keys = T2VAnimKeys(SimpleNamespace(**{'max_frames': args.frames, 'inpainting_weights': args.inpainting_weights}), args.seed, args.inpainting_frames) + images = [] + print("Received an image for inpainting", args.inpainting_image.name) + for i in range(args.frames): + image = Image.open(args.inpainting_image.name).convert("RGB") + image = image.resize((args.width, args.height), Image.ANTIALIAS) + array = np.array(image) + images += [array] + + images = np.stack(images) # f h w c + batches = 1 + n_images = np.tile(images[np.newaxis, ...], (batches, 1, 1, 1, 1)) # n f h w c + bcfhw = n_images.transpose(0, 4, 1, 2, 3) + # convert to 0-1 float + bcfhw = bcfhw.astype(np.float32) / 255 + bfchw = bcfhw.transpose(0, 2, 1, 3, 4) # b c f h w + + print(f"Converted the frames to tensor {bfchw.shape}") + + vd_out = torch.from_numpy(bcfhw).to("cuda") + + # should be -1,1, not 0,1 + vd_out = 2 * vd_out - 1 + + # latents should have shape num_sample, 4, max_frames, latent_h,latent_w + # but right now they have shape num_sample=1,4, 1 (only used 1 img), latent_h, latent_w + print("Computing latents") + image_latents = pipe.compute_latents(vd_out).numpy() + # padding_width = [(0, 0), (0, 0), (0, frames-inpainting_frames), (0, 0), (0, 0)] + # padded_latents = np.pad(image_latents, pad_width=padding_width, mode='constant', constant_values=0) + + latent_h = args.height // 8 + latent_w = args.width // 8 + latent_noise = np.random.normal(size=(1, 4, args.frames, latent_h, latent_w)) + mask = np.ones(shape=(1, 4, args.frames, latent_h, latent_w)) + + mask_weights = [keys.inpainting_weights_series[frame_idx] for frame_idx in range(args.frames)] + + for i in range(args.frames): + v = mask_weights[i] + mask[:, :, i, :, :] = v + + masked_latents = image_latents * (1 - mask) + latent_noise * mask + + latents = torch.tensor(masked_latents).to(device) + + mask = torch.tensor(mask).to(device) + + args.strength = 1 + + samples, _ = pipe.infer(args.prompt, args.n_prompt, args.steps, args.frames, args.seed + batch if args.seed != -1 else -1, args.cfg_scale, + args.width, args.height, args.eta, cpu_vae, device, latents, skip_steps=skip_steps, mask=mask) + + if batch > 0: + outdir_current = os.path.join(get_outdir(), f"{init_timestring}_{batch}") + print(f'text2video finished, saving frames to {outdir_current}') + + # just deleted the folder so we need to make it 
again + os.makedirs(outdir_current, exist_ok=True) + for i in range(len(samples)): + cv2.imwrite(outdir_current + os.path.sep + + f"{(video_number * args.frames) + i:06}.png", samples[i]) + if video_args.stitched_videos > 0 and video_number < video_args.stitched_videos: + continue_num = (args.frames - 1) * (video_number + 1) + print(f"Continuing from frame {continue_num}") + args.inpainting_image = open(f'{outdir_current + os.path.sep}{continue_num:06}.png', 'rb') # TODO: add params to the GUI if not video_args.skip_video_creation: diff --git a/scripts/t2v_helpers/args.py b/scripts/t2v_helpers/args.py index 3c3b13d..73b7fc7 100644 --- a/scripts/t2v_helpers/args.py +++ b/scripts/t2v_helpers/args.py @@ -64,6 +64,13 @@ def setup_text2video_settings_dictionary(): with gr.Tab('txt2vid') as tab_txt2vid: # TODO: make it how it's done in Deforum/WebUI, so we won't have to track individual vars prompt, n_prompt, steps, seed, cfg_scale, width, height, eta, frames, batch_count = setup_common_values('txt2vid', d) + gr.Markdown('''`Stitched videos` allows you to generate multiple videos consecutively and combine them into + one video when they're done. Use inpainting frames and inpainting weights to adjust the transition between + videos. + + Currently only works with ModelScope''') + with gr.Row(): + stitched_videos = gr.Slider(label="Stitched videos", value=d.stitched_videos, minimum=0, maximum=20, step=1, interactive=True) with gr.Accordion('img2vid', open=False): inpainting_image = gr.File(label="Inpainting image", interactive=True, file_count="single", file_types=["image"], elem_id="inpainting_chosen_file") # TODO: should be tied to the total frame count dynamically @@ -127,7 +134,7 @@ def update_max_vid_frames(v2v_frames, sFrame): # Show video return locals() -t2v_video_args_names = str('skip_video_creation, ffmpeg_location, ffmpeg_crf, ffmpeg_preset, fps, add_soundtrack, soundtrack_path').replace("\n", "").replace("\r", "").replace(" ", "").split(',') +t2v_video_args_names = str('skip_video_creation, ffmpeg_location, ffmpeg_crf, ffmpeg_preset, fps, add_soundtrack, soundtrack_path, stitched_videos').replace("\n", "").replace("\r", "").replace(" ", "").split(',') common_values_names = str('''prompt, n_prompt, steps, frames, seed, cfg_scale, width, height, eta, batch_count''').replace("\n", "").replace("\r", "").replace(" ", "").split(',') @@ -160,6 +167,8 @@ def process_args(args_dict): if f'{name}_v' in args_dict: args_dict.pop(f'{name}_v') + print(f'args_dict: {args_dict}') + args = SimpleNamespace(**pack_anim_args(args_dict)) video_args = SimpleNamespace(**pack_video_args(args_dict)) T2VArgs_sanity_check(args) @@ -177,6 +186,7 @@ def T2VArgs(): prompt = "" n_prompt = "text, watermark, copyright, blurry, nsfw" strength = 0.75 + stitched_videos = 0 vid2vid_startFrame = 0 inpainting_weights = '0:(t/max_i_f), "max_i_f":(1)' # linear growth weights (as they used to be in the original variant) inpainting_frames = 0 From 1a31ccd3e643bb7079cc9e14f12d887ef613edf2 Mon Sep 17 00:00:00 2001 From: occlusion Date: Wed, 12 Jul 2023 20:11:45 -0400 Subject: [PATCH 2/5] Changed how videos are merged together --- requirements.txt | 1 + scripts/modelscope/process_modelscope.py | 174 +++++++++++++---------- scripts/t2v_helpers/args.py | 6 +- scripts/t2v_helpers/video_audio_utils.py | 132 ++++++++++++++--- 4 files changed, 215 insertions(+), 98 deletions(-) diff --git a/requirements.txt b/requirements.txt index b559d24..dd7c46c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,3 +2,4 @@ imageio_ffmpeg av 
moviepy numexpr +opencv-python \ No newline at end of file diff --git a/scripts/modelscope/process_modelscope.py b/scripts/modelscope/process_modelscope.py index 3e0e49c..1d2e555 100644 --- a/scripts/modelscope/process_modelscope.py +++ b/scripts/modelscope/process_modelscope.py @@ -1,5 +1,5 @@ # Function calls referenced from https://github.com/modelscope/modelscope/tree/master/modelscope/pipelines/multi_modal - +import shutil # Copyright (C) 2023 by Artem Khrapov (kabachuha) # Read LICENSE for usage terms. @@ -17,7 +17,7 @@ from types import SimpleNamespace from t2v_helpers.general_utils import get_t2v_version, get_model_location import time, math -from t2v_helpers.video_audio_utils import ffmpeg_stitch_video, get_quick_vid_info, vid2frames, duplicate_pngs_from_folder, clean_folder_name +from t2v_helpers.video_audio_utils import ffmpeg_stitch_video, ffmpeg_reverse_frames, ffmpeg_combine_videos, get_quick_vid_info, vid2frames, duplicate_pngs_from_folder, clean_folder_name from t2v_helpers.args import get_outdir, process_args import t2v_helpers.args as t2v_helpers_args from modules import shared, sd_hijack, lowvram @@ -70,80 +70,82 @@ def process_modelscope(args_dict): mask = None - if args.do_vid2vid: - if args.vid2vid_frames is None and args.vid2vid_frames_path == "": - raise FileNotFoundError("Please upload a video :()") + print(f'Generating {video_args.stitched_videos + 1} video(s) with {args.frames} frames each') - # Overrides - if args.vid2vid_frames is not None: - vid2vid_frames_path = args.vid2vid_frames.name + for video_number in range(0, video_args.stitched_videos + 1): + if args.do_vid2vid: + if args.vid2vid_frames is None and args.vid2vid_frames_path == "": + raise FileNotFoundError("Please upload a video :()") - print("got a request to *vid2vid* an existing video.") + # Overrides + if args.vid2vid_frames is not None: + vid2vid_frames_path = args.vid2vid_frames.name - in_vid_fps, _, _ = get_quick_vid_info(vid2vid_frames_path) - folder_name = clean_folder_name(Path(vid2vid_frames_path).stem) - outdir_no_tmp = os.path.join(os.getcwd(), 'outputs', 'frame-vid2vid', folder_name) - i = 1 - while os.path.exists(outdir_no_tmp): - outdir_no_tmp = os.path.join(os.getcwd(), 'outputs', 'frame-vid2vid', folder_name + '_' + str(i)) - i += 1 + print("got a request to *vid2vid* an existing video.") - outdir_v2v = os.path.join(outdir_no_tmp, 'tmp_input_frames') - os.makedirs(outdir_v2v, exist_ok=True) + in_vid_fps, _, _ = get_quick_vid_info(vid2vid_frames_path) + folder_name = clean_folder_name(Path(vid2vid_frames_path).stem) + outdir_no_tmp = os.path.join(os.getcwd(), 'outputs', 'frame-vid2vid', folder_name) + i = 1 + while os.path.exists(outdir_no_tmp): + outdir_no_tmp = os.path.join(os.getcwd(), 'outputs', 'frame-vid2vid', folder_name + '_' + str(i)) + i += 1 - vid2frames(video_path=vid2vid_frames_path, video_in_frame_path=outdir_v2v, overwrite=True, extract_from_frame=args.vid2vid_startFrame, extract_to_frame=args.vid2vid_startFrame + args.frames, - numeric_files_output=True, out_img_format='png') + outdir_v2v = os.path.join(outdir_no_tmp, 'tmp_input_frames') + os.makedirs(outdir_v2v, exist_ok=True) - temp_convert_raw_png_path = os.path.join(outdir_v2v, "tmp_vid2vid_folder") - duplicate_pngs_from_folder(outdir_v2v, temp_convert_raw_png_path, None, folder_name) + extract_to_frame=args.vid2vid_startFrame + args.frames + print(f'vid2vid_frames_path: {vid2vid_frames_path} outdir_v2v: {outdir_v2v} extract_from_frame: {args.vid2vid_startFrame} extract_to_frame: {extract_to_frame}') + 
vid2frames(video_path=vid2vid_frames_path, video_in_frame_path=outdir_v2v, overwrite=True, extract_from_frame=args.vid2vid_startFrame, extract_to_frame=extract_to_frame, + numeric_files_output=True, out_img_format='png') - videogen = [] - for f in os.listdir(temp_convert_raw_png_path): - # double check for old _depth_ files, not really needed probably but keeping it for now - if '_depth_' not in f: - videogen.append(f) + temp_convert_raw_png_path = os.path.join(outdir_v2v, "tmp_vid2vid_folder") + duplicate_pngs_from_folder(outdir_v2v, temp_convert_raw_png_path, None, folder_name) - videogen.sort(key=lambda x: int(x.split('.')[0])) + videogen = [] + for f in os.listdir(temp_convert_raw_png_path): + # double check for old _depth_ files, not really needed probably but keeping it for now + if '_depth_' not in f: + videogen.append(f) - images = [] - for file in tqdm(videogen, desc="Loading frames"): - image = Image.open(os.path.join(temp_convert_raw_png_path, file)) - image = image.resize((args.width, args.height), Image.ANTIALIAS) - array = np.array(image) - images += [array] + videogen.sort(key=lambda x: int(x.split('.')[0])) - # print(images) + images = [] + for file in tqdm(videogen, desc="Loading frames"): + image = Image.open(os.path.join(temp_convert_raw_png_path, file)) + image = image.resize((args.width, args.height))#, Image.ANTIALIAS) + array = np.array(image) + images += [array] - images = np.stack(images) # f h w c - batches = 1 - n_images = np.tile(images[np.newaxis, ...], (batches, 1, 1, 1, 1)) # n f h w c - bcfhw = n_images.transpose(0, 4, 1, 2, 3) - # convert to 0-1 float - bcfhw = bcfhw.astype(np.float32) / 255 - bfchw = bcfhw.transpose(0, 2, 1, 3, 4) # b c f h w + # print(images) - print(f"Converted the frames to tensor {bfchw.shape}") + images = np.stack(images) # f h w c + batches = 1 + n_images = np.tile(images[np.newaxis, ...], (batches, 1, 1, 1, 1)) # n f h w c + bcfhw = n_images.transpose(0, 4, 1, 2, 3) + # convert to 0-1 float + bcfhw = bcfhw.astype(np.float32) / 255 + bfchw = bcfhw.transpose(0, 2, 1, 3, 4) # b c f h w - vd_out = torch.from_numpy(bcfhw).to("cuda") + print(f"Converted the frames to tensor {bfchw.shape}") - # should be -1,1, not 0,1 - vd_out = 2 * vd_out - 1 + vd_out = torch.from_numpy(bcfhw).to("cuda") - # latents should have shape num_sample, 4, max_frames, latent_h,latent_w - print("Computing latents") - latents = pipe.compute_latents(vd_out).to(device) + # should be -1,1, not 0,1 + vd_out = 2 * vd_out - 1 - skip_steps = int(math.floor(args.steps * max(0, min(1 - args.strength, 1)))) - else: - latents = None - args.strength = 1 - skip_steps = 0 + # latents should have shape num_sample, 4, max_frames, latent_h,latent_w + print("Computing latents") + latents = pipe.compute_latents(vd_out).to(device) - print('Working in txt2vid mode' if not args.do_vid2vid else 'Working in vid2vid mode') + skip_steps = int(math.floor(args.steps * max(0, min(1 - args.strength, 1)))) + else: + latents = None + args.strength = 1 + skip_steps = 0 - print(f'Generating {video_args.stitched_videos + 1} video(s) with {args.frames} frames each') + print('Working in txt2vid mode' if not args.do_vid2vid else 'Working in vid2vid mode') - for video_number in range(0, video_args.stitched_videos + 1): # Start the batch count loop pbar = tqdm(range(args.batch_count), leave=False) if args.batch_count == 1: @@ -169,7 +171,7 @@ def process_modelscope(args_dict): print("Received an image for inpainting", args.inpainting_image.name) for i in range(args.frames): image = 
Image.open(args.inpainting_image.name).convert("RGB") - image = image.resize((args.width, args.height), Image.ANTIALIAS) + image = image.resize((args.width, args.height)) array = np.array(image) images += [array] @@ -218,32 +220,54 @@ def process_modelscope(args_dict): args.width, args.height, args.eta, cpu_vae, device, latents, strength=args.strength, skip_steps=skip_steps, mask=mask, is_vid2vid=args.do_vid2vid, sampler=args.sampler) if batch > 0: - outdir_current = os.path.join(get_outdir(), f"{init_timestring}_{batch}") + outdir_current = os.path.join(get_outdir(), f"{init_timestring}_{batch}", str(video_number)) + else: + outdir_current = os.path.join(get_outdir(), f"{init_timestring}", str(video_number)) print(f'text2video finished, saving frames to {outdir_current}') + print(f'I made {len(samples)} samples!') # just deleted the folder so we need to make it again os.makedirs(outdir_current, exist_ok=True) for i in range(len(samples)): cv2.imwrite(outdir_current + os.path.sep + - f"{(video_number * args.frames) + i:06}.png", samples[i]) - if video_args.stitched_videos > 0 and video_number < video_args.stitched_videos: - continue_num = (args.frames - 1) * (video_number + 1) - print(f"Continuing from frame {continue_num}") - args.inpainting_image = open(f'{outdir_current + os.path.sep}{continue_num:06}.png', 'rb') - - # TODO: add params to the GUI - if not video_args.skip_video_creation: - ffmpeg_stitch_video(ffmpeg_location=video_args.ffmpeg_location, fps=video_args.fps, outmp4_path=outdir_current + os.path.sep + f"vid.mp4", imgs_path=os.path.join(outdir_current, - "%06d.png"), - stitch_from_frame=0, stitch_to_frame=-1, add_soundtrack=video_args.add_soundtrack, - audio_path=vid2vid_frames_path if video_args.add_soundtrack == 'Init Video' else video_args.soundtrack_path, crf=video_args.ffmpeg_crf, preset=video_args.ffmpeg_preset) + f"{i:06}.png", samples[i]) + + gc.collect() + devices.torch_gc() + + # TODO: add params to the GUI + if not video_args.skip_video_creation: + ffmpeg_stitch_video(ffmpeg_location=video_args.ffmpeg_location, fps=video_args.fps, outmp4_path=outdir_current + os.path.sep + f"vid.mp4", imgs_path=os.path.join(outdir_current, + "%06d.png"), + stitch_from_frame=0, stitch_to_frame=-1, add_soundtrack=video_args.add_soundtrack, + audio_path=vid2vid_frames_path if video_args.add_soundtrack == 'Init Video' else video_args.soundtrack_path, crf=video_args.ffmpeg_crf, preset=video_args.ffmpeg_preset) + if video_args.stitched_videos > 0 and video_number < video_args.stitched_videos and video_args.stitched_video_strength > 0: + reverse_video_path = outdir_current + os.path.sep + f"vid_reversed.mp4" + ffmpeg_reverse_frames(ffmpeg_location=video_args.ffmpeg_location, fps=video_args.fps, outmp4_path=reverse_video_path, input_path=os.path.join(outdir_current, "%06d.png"), crf=video_args.ffmpeg_crf, preset=video_args.ffmpeg_preset) + args.do_vid2vid = True + args.vid2vid_startFrame = 0 + print(f"vid2vid start frame: {args.vid2vid_startFrame}") + print(f'strength: {args.strength}') + args.strength = video_args.stitched_video_strength + args.vid2vid_frames = open(reverse_video_path, 'rb') print(f't2v complete, result saved at {outdir_current}') - mp4 = open(outdir_current + os.path.sep + f"vid.mp4", 'rb').read() - dataurl = "data:video/mp4;base64," + b64encode(mp4).decode() + outdir_current = os.path.join(get_outdir(), f"{init_timestring}", "final") + + os.makedirs(outdir_current, exist_ok=True) + + combined_video_path = os.path.join(get_outdir(), f"{init_timestring}", "final") + 
combined_video_list = [os.path.join(get_outdir(), f"{init_timestring}", str(i), "vid.mp4").replace('/', os.path.sep) for i in range(0, video_args.stitched_videos + 1)] + + ffmpeg_combine_videos(ffmpeg_location=video_args.ffmpeg_location, fps=video_args.fps, outmp4_path=combined_video_path + os.path.sep + f"vid.mp4", + input_videos=combined_video_list, crf=video_args.ffmpeg_crf, preset=video_args.ffmpeg_preset, add_soundtrack=video_args.add_soundtrack, + audio_path=video_args.soundtrack_path) + + mp4 = open(outdir_current + os.path.sep + f"vid.mp4", 'rb').read() + dataurl = "data:video/mp4;base64," + b64encode(mp4).decode() - if max_vids_to_pack == -1 or len(vids_to_pack) < max_vids_to_pack: - vids_to_pack.append(dataurl) + if max_vids_to_pack == -1 or len(vids_to_pack) < max_vids_to_pack: + vids_to_pack.append(dataurl) t2v_helpers_args.i1_store_t2v = f'

text2video extension for auto1111 — version 1.2b
' for dataurl in vids_to_pack: t2v_helpers_args.i1_store_t2v += f'
' diff --git a/scripts/t2v_helpers/args.py b/scripts/t2v_helpers/args.py index 6513013..c98d044 100644 --- a/scripts/t2v_helpers/args.py +++ b/scripts/t2v_helpers/args.py @@ -107,7 +107,8 @@ def refresh_all_models(model): Currently only works with ModelScope''') with gr.Row(): - stitched_videos = gr.Slider(label="Stitched videos", value=d.stitched_videos, minimum=0, maximum=20, step=1, interactive=True) + stitched_videos = gr.Slider(label=f"Stitched videos. Total Length: {(d.frames * (d.stitched_videos + 1)) / dv.fps}", value=d.stitched_videos, minimum=0, maximum=200, step=1, interactive=True) + stitched_video_strength = gr.Slider(label="Stitched video denoising strength", value=d.stitched_video_strength, minimum=0, maximum=1, step=0.01, interactive=True) with gr.Accordion('img2vid', open=False): inpainting_image = gr.File(label="Inpainting image", interactive=True, file_count="single", file_types=["image"], elem_id="inpainting_chosen_file") # TODO: should be tied to the total frame count dynamically @@ -162,7 +163,7 @@ def refresh_all_models(model): return locals() -t2v_video_args_names = str('skip_video_creation, ffmpeg_location, ffmpeg_crf, ffmpeg_preset, fps, add_soundtrack, soundtrack_path, stitched_videos').replace("\n", "").replace("\r", "").replace(" ", "").split(',') +t2v_video_args_names = str('skip_video_creation, ffmpeg_location, ffmpeg_crf, ffmpeg_preset, fps, add_soundtrack, soundtrack_path, stitched_videos, stitched_video_strength').replace("\n", "").replace("\r", "").replace(" ", "").split(',') common_values_names = str('''prompt, n_prompt, sampler, steps, frames, seed, cfg_scale, width, height, eta, batch_count''').replace("\n", "").replace("\r", "").replace(" ", "").split(',') @@ -213,6 +214,7 @@ def T2VArgs(): n_prompt = "text, watermark, copyright, blurry, nsfw" strength = 0.75 stitched_videos = 0 + stitched_video_strength = 0.0 vid2vid_startFrame = 0 inpainting_weights = '0:(t/max_i_f), "max_i_f":(1)' # linear growth weights (as they used to be in the original variant) inpainting_frames = 0 diff --git a/scripts/t2v_helpers/video_audio_utils.py b/scripts/t2v_helpers/video_audio_utils.py index c2aba1a..73df730 100644 --- a/scripts/t2v_helpers/video_audio_utils.py +++ b/scripts/t2v_helpers/video_audio_utils.py @@ -119,7 +119,7 @@ def find_ffmpeg_binary(): return files[0] if files else 'ffmpeg' except: return 'ffmpeg' - + # Stitch images to a h264 mp4 video using ffmpeg def ffmpeg_stitch_video(ffmpeg_location=None, fps=None, outmp4_path=None, stitch_from_frame=0, stitch_to_frame=None, imgs_path=None, add_soundtrack=None, audio_path=None, crf=17, preset='veryslow'): start_time = time.time() @@ -161,20 +161,56 @@ def ffmpeg_stitch_video(ffmpeg_location=None, fps=None, outmp4_path=None, stitch raise Exception( f'Error stitching frames to video. 
Actual runtime error:{e}') + start_time = time.time() + if add_soundtrack != 'None': - audio_add_start_time = time.time() - try: + ffmpeg_apply_soundtrack(add_soundtrack, audio_path, ffmpeg_location, msg_to_print, outmp4_path, start_time) + else: + print("\r" + " " * len(msg_to_print), end="", flush=True) + print(f"\r{msg_to_print}", flush=True) + print(f"\rVideo stitching \033[0;32mdone\033[0m in {time.time() - start_time:.2f} seconds!", flush=True) + +def ffmpeg_reverse_frames(ffmpeg_location=None, fps=None, outmp4_path=None, stitch_from_frame=0, stitch_to_frame=None, input_path=None, add_soundtrack=None, audio_path=None, crf=17, preset='veryslow'): + try: + cmd = [ + ffmpeg_location, + '-y', + '-i', input_path, + '-vf', 'reverse', + outmp4_path + ] + process = subprocess.Popen( + cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + stdout, stderr = process.communicate() + except FileNotFoundError: + print("\r" + " " * len(msg_to_print), end="", flush=True) + print(f"\r{msg_to_print}", flush=True) + raise FileNotFoundError( + "FFmpeg not found. Please make sure you have a working ffmpeg path under 'ffmpeg_location' parameter.") + except Exception as e: + print("\r" + " " * len(msg_to_print), end="", flush=True) + print(f"\r{msg_to_print}", flush=True) + raise Exception( + f'Error stitching frames to video. Actual runtime error:{e}') + +def ffmpeg_combine_videos(ffmpeg_location=None, fps=None, outmp4_path=None, input_videos=[], add_soundtrack=None, audio_path=None, crf=17, preset='veryslow'): + print(f"Got a request to combine videos using FFmpeg.\nVideo count:\n{len(input_videos)}\nTo Video:\n{outmp4_path}") + msg_to_print = f"Combining *video*..." + print(msg_to_print) + + videos = [f'-i \'{video}\'' for video in input_videos] + + try: + #convert each file to a .ts file using ffmpeg + for i in range(len(input_videos)): cmd = [ ffmpeg_location, - '-i', - outmp4_path, - '-i', - audio_path, - '-map', '0:v', - '-map', '1:a', - '-c:v', 'copy', - '-shortest', - outmp4_path+'.temp.mp4' + '-y', + '-i', input_videos[i], + '-c', 'copy', + '-bsf:v', 'h264_mp4toannexb', + '-f', 'mpegts', + f'{input_videos[i]}.ts' ] process = subprocess.Popen( cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) @@ -183,20 +219,74 @@ def ffmpeg_stitch_video(ffmpeg_location=None, fps=None, outmp4_path=None, stitch print("\r" + " " * len(msg_to_print), end="", flush=True) print(f"\r{msg_to_print}", flush=True) raise RuntimeError(stderr) - os.replace(outmp4_path+'.temp.mp4', outmp4_path) - print("\r" + " " * len(msg_to_print), end="", flush=True) - print(f"\r{msg_to_print}", flush=True) - print(f"\rFFmpeg Video+Audio stitching \033[0;32mdone\033[0m in {time.time() - start_time:.2f} seconds!", flush=True) - except Exception as e: - print("\r" + " " * len(msg_to_print), end="", flush=True) - print(f"\r{msg_to_print}", flush=True) - print(f'\rError adding audio to video. 
Actual error: {e}', flush=True) - print(f"FFMPEG Video (sorry, no audio) stitching \033[33mdone\033[0m in {time.time() - start_time:.2f} seconds!", flush=True) + # combine all the ts files into one mp4 file + cmd = [ + ffmpeg_location, + '-y', + '-i', f'concat:{"|".join([f"{video}.ts" for video in input_videos])}', + '-c', 'copy', + '-bsf:a', 'aac_adtstoasc', + outmp4_path + ] + process = subprocess.Popen( + cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + stdout, stderr = process.communicate() + + except FileNotFoundError: + print("\r" + " " * len(msg_to_print), end="", flush=True) + print(f"\r{msg_to_print}", flush=True) + raise FileNotFoundError( + "FFmpeg not found. Please make sure you have a working ffmpeg path under 'ffmpeg_location' parameter.") + except Exception as e: + print(str(e)) + #print("\r" + " " * len(msg_to_print), end="", flush=True) + #print(f"\r{msg_to_print}", flush=True) + raise Exception( + f'Error stitching frames to video. Actual runtime error:{e}') + + start_time = time.time() + + if add_soundtrack != 'None': + ffmpeg_apply_soundtrack(add_soundtrack, audio_path, ffmpeg_location, msg_to_print, outmp4_path, start_time) else: print("\r" + " " * len(msg_to_print), end="", flush=True) print(f"\r{msg_to_print}", flush=True) print(f"\rVideo stitching \033[0;32mdone\033[0m in {time.time() - start_time:.2f} seconds!", flush=True) +def ffmpeg_apply_soundtrack(add_soundtrack, audio_path, ffmpeg_location, msg_to_print, outmp4_path, start_time): + try: + cmd = [ + ffmpeg_location, + '-i', + outmp4_path, + '-i', + audio_path, + '-map', '0:v', + '-map', '1:a', + '-c:v', 'copy', + '-shortest', + outmp4_path + '.temp.mp4' + ] + process = subprocess.Popen( + cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + stdout, stderr = process.communicate() + if process.returncode != 0: + print("\r" + " " * len(msg_to_print), end="", flush=True) + print(f"\r{msg_to_print}", flush=True) + raise RuntimeError(stderr) + os.replace(outmp4_path + '.temp.mp4', outmp4_path) + print("\r" + " " * len(msg_to_print), end="", flush=True) + print(f"\r{msg_to_print}", flush=True) + print(f"\rFFmpeg Video+Audio stitching \033[0;32mdone\033[0m in {time.time() - start_time:.2f} seconds!", + flush=True) + except Exception as e: + print("\r" + " " * len(msg_to_print), end="", flush=True) + print(f"\r{msg_to_print}", flush=True) + print(f'\rError adding audio to video. 
Actual error: {e}', flush=True) + print( + f"FFMPEG Video (sorry, no audio) stitching \033[33mdone\033[0m in {time.time() - start_time:.2f} seconds!", + flush=True) + # quick-retreive frame count, FPS and H/W dimensions of a video (local or URL-based) def get_quick_vid_info(vid_path): vidcap = cv2.VideoCapture(vid_path) From 8b22a3be95406a7e35ae63ba6b58b04e4590498a Mon Sep 17 00:00:00 2001 From: occlusion Date: Wed, 12 Jul 2023 21:55:12 -0400 Subject: [PATCH 3/5] Changed the stitching system to use the existing batch parameters --- scripts/modelscope/process_modelscope.py | 308 +++++++++++------------ scripts/t2v_helpers/args.py | 6 +- 2 files changed, 157 insertions(+), 157 deletions(-) diff --git a/scripts/modelscope/process_modelscope.py b/scripts/modelscope/process_modelscope.py index 1d2e555..cfde66b 100644 --- a/scripts/modelscope/process_modelscope.py +++ b/scripts/modelscope/process_modelscope.py @@ -70,55 +70,108 @@ def process_modelscope(args_dict): mask = None - print(f'Generating {video_args.stitched_videos + 1} video(s) with {args.frames} frames each') + if args.do_vid2vid: + if args.vid2vid_frames is None and args.vid2vid_frames_path == "": + raise FileNotFoundError("Please upload a video :()") - for video_number in range(0, video_args.stitched_videos + 1): - if args.do_vid2vid: - if args.vid2vid_frames is None and args.vid2vid_frames_path == "": - raise FileNotFoundError("Please upload a video :()") + # Overrides + if args.vid2vid_frames is not None: + vid2vid_frames_path = args.vid2vid_frames.name - # Overrides - if args.vid2vid_frames is not None: - vid2vid_frames_path = args.vid2vid_frames.name + print("got a request to *vid2vid* an existing video.") - print("got a request to *vid2vid* an existing video.") + in_vid_fps, _, _ = get_quick_vid_info(vid2vid_frames_path) + folder_name = clean_folder_name(Path(vid2vid_frames_path).stem) + outdir_no_tmp = os.path.join(os.getcwd(), 'outputs', 'frame-vid2vid', folder_name) + i = 1 + while os.path.exists(outdir_no_tmp): + outdir_no_tmp = os.path.join(os.getcwd(), 'outputs', 'frame-vid2vid', folder_name + '_' + str(i)) + i += 1 - in_vid_fps, _, _ = get_quick_vid_info(vid2vid_frames_path) - folder_name = clean_folder_name(Path(vid2vid_frames_path).stem) - outdir_no_tmp = os.path.join(os.getcwd(), 'outputs', 'frame-vid2vid', folder_name) - i = 1 - while os.path.exists(outdir_no_tmp): - outdir_no_tmp = os.path.join(os.getcwd(), 'outputs', 'frame-vid2vid', folder_name + '_' + str(i)) - i += 1 + outdir_v2v = os.path.join(outdir_no_tmp, 'tmp_input_frames') + os.makedirs(outdir_v2v, exist_ok=True) - outdir_v2v = os.path.join(outdir_no_tmp, 'tmp_input_frames') - os.makedirs(outdir_v2v, exist_ok=True) + extract_to_frame=args.vid2vid_startFrame + args.frames + print(f'vid2vid_frames_path: {vid2vid_frames_path} outdir_v2v: {outdir_v2v} extract_from_frame: {args.vid2vid_startFrame} extract_to_frame: {extract_to_frame}') + vid2frames(video_path=vid2vid_frames_path, video_in_frame_path=outdir_v2v, overwrite=True, extract_from_frame=args.vid2vid_startFrame, extract_to_frame=extract_to_frame, + numeric_files_output=True, out_img_format='png') - extract_to_frame=args.vid2vid_startFrame + args.frames - print(f'vid2vid_frames_path: {vid2vid_frames_path} outdir_v2v: {outdir_v2v} extract_from_frame: {args.vid2vid_startFrame} extract_to_frame: {extract_to_frame}') - vid2frames(video_path=vid2vid_frames_path, video_in_frame_path=outdir_v2v, overwrite=True, extract_from_frame=args.vid2vid_startFrame, extract_to_frame=extract_to_frame, - 
numeric_files_output=True, out_img_format='png') + temp_convert_raw_png_path = os.path.join(outdir_v2v, "tmp_vid2vid_folder") + duplicate_pngs_from_folder(outdir_v2v, temp_convert_raw_png_path, None, folder_name) - temp_convert_raw_png_path = os.path.join(outdir_v2v, "tmp_vid2vid_folder") - duplicate_pngs_from_folder(outdir_v2v, temp_convert_raw_png_path, None, folder_name) + videogen = [] + for f in os.listdir(temp_convert_raw_png_path): + # double check for old _depth_ files, not really needed probably but keeping it for now + if '_depth_' not in f: + videogen.append(f) - videogen = [] - for f in os.listdir(temp_convert_raw_png_path): - # double check for old _depth_ files, not really needed probably but keeping it for now - if '_depth_' not in f: - videogen.append(f) + videogen.sort(key=lambda x: int(x.split('.')[0])) - videogen.sort(key=lambda x: int(x.split('.')[0])) + images = [] + for file in tqdm(videogen, desc="Loading frames"): + image = Image.open(os.path.join(temp_convert_raw_png_path, file)) + image = image.resize((args.width, args.height))#, Image.ANTIALIAS) + array = np.array(image) + images += [array] + # print(images) + + images = np.stack(images) # f h w c + batches = 1 + n_images = np.tile(images[np.newaxis, ...], (batches, 1, 1, 1, 1)) # n f h w c + bcfhw = n_images.transpose(0, 4, 1, 2, 3) + # convert to 0-1 float + bcfhw = bcfhw.astype(np.float32) / 255 + bfchw = bcfhw.transpose(0, 2, 1, 3, 4) # b c f h w + + print(f"Converted the frames to tensor {bfchw.shape}") + + vd_out = torch.from_numpy(bcfhw).to("cuda") + + # should be -1,1, not 0,1 + vd_out = 2 * vd_out - 1 + + # latents should have shape num_sample, 4, max_frames, latent_h,latent_w + print("Computing latents") + latents = pipe.compute_latents(vd_out).to(device) + + skip_steps = int(math.floor(args.steps * max(0, min(1 - args.strength, 1)))) + else: + latents = None + args.strength = 1 + skip_steps = 0 + + print('Working in txt2vid mode' if not args.do_vid2vid else 'Working in vid2vid mode') + + # Start the batch count loop + pbar = tqdm(range(args.batch_count), leave=False) + if args.batch_count == 1: + pbar.disable = True + + vids_to_pack = [] + + state.job_count = args.batch_count + + for batch in pbar: + state.job_no = batch + if state.skipped: + state.skipped = False + + if state.interrupted: + break + + shared.state.job = f"Batch {batch + 1} out of {args.batch_count}" + # TODO: move to a separate function + if args.inpainting_frames > 0 and hasattr(args.inpainting_image, "name"): + keys = T2VAnimKeys(SimpleNamespace(**{'max_frames': args.frames, 'inpainting_weights': args.inpainting_weights}), args.seed, args.inpainting_frames) images = [] - for file in tqdm(videogen, desc="Loading frames"): - image = Image.open(os.path.join(temp_convert_raw_png_path, file)) - image = image.resize((args.width, args.height))#, Image.ANTIALIAS) + print("Received an image for inpainting", args.inpainting_image.name) + for i in range(args.frames): + image = Image.open(args.inpainting_image.name).convert("RGB") + image = image.resize((args.width, args.height)) array = np.array(image) images += [array] - # print(images) - images = np.stack(images) # f h w c batches = 1 n_images = np.tile(images[np.newaxis, ...], (batches, 1, 1, 1, 1)) # n f h w c @@ -135,133 +188,80 @@ def process_modelscope(args_dict): vd_out = 2 * vd_out - 1 # latents should have shape num_sample, 4, max_frames, latent_h,latent_w + # but right now they have shape num_sample=1,4, 1 (only used 1 img), latent_h, latent_w print("Computing latents") - latents = 
pipe.compute_latents(vd_out).to(device) + image_latents = pipe.compute_latents(vd_out).numpy() + # padding_width = [(0, 0), (0, 0), (0, frames-inpainting_frames), (0, 0), (0, 0)] + # padded_latents = np.pad(image_latents, pad_width=padding_width, mode='constant', constant_values=0) + + latent_h = args.height // 8 + latent_w = args.width // 8 + latent_noise = np.random.normal(size=(1, 4, args.frames, latent_h, latent_w)) + mask = np.ones(shape=(1, 4, args.frames, latent_h, latent_w)) + + mask_weights = [keys.inpainting_weights_series[frame_idx] for frame_idx in range(args.frames)] + + for i in range(args.frames): + v = mask_weights[i] + mask[:, :, i, :, :] = v + + masked_latents = image_latents * (1 - mask) + latent_noise * mask + + latents = torch.tensor(masked_latents).to(device) + + mask = torch.tensor(mask).to(device) - skip_steps = int(math.floor(args.steps * max(0, min(1 - args.strength, 1)))) - else: - latents = None args.strength = 1 - skip_steps = 0 - - print('Working in txt2vid mode' if not args.do_vid2vid else 'Working in vid2vid mode') - - # Start the batch count loop - pbar = tqdm(range(args.batch_count), leave=False) - if args.batch_count == 1: - pbar.disable = True - - vids_to_pack = [] - - state.job_count = args.batch_count - - for batch in pbar: - state.job_no = batch - if state.skipped: - state.skipped = False - - if state.interrupted: - break - - shared.state.job = f"Batch {batch + 1} out of {args.batch_count}" - # TODO: move to a separate function - if args.inpainting_frames > 0 and hasattr(args.inpainting_image, "name"): - keys = T2VAnimKeys(SimpleNamespace(**{'max_frames': args.frames, 'inpainting_weights': args.inpainting_weights}), args.seed, args.inpainting_frames) - images = [] - print("Received an image for inpainting", args.inpainting_image.name) - for i in range(args.frames): - image = Image.open(args.inpainting_image.name).convert("RGB") - image = image.resize((args.width, args.height)) - array = np.array(image) - images += [array] - - images = np.stack(images) # f h w c - batches = 1 - n_images = np.tile(images[np.newaxis, ...], (batches, 1, 1, 1, 1)) # n f h w c - bcfhw = n_images.transpose(0, 4, 1, 2, 3) - # convert to 0-1 float - bcfhw = bcfhw.astype(np.float32) / 255 - bfchw = bcfhw.transpose(0, 2, 1, 3, 4) # b c f h w - - print(f"Converted the frames to tensor {bfchw.shape}") - - vd_out = torch.from_numpy(bcfhw).to("cuda") - - # should be -1,1, not 0,1 - vd_out = 2 * vd_out - 1 - - # latents should have shape num_sample, 4, max_frames, latent_h,latent_w - # but right now they have shape num_sample=1,4, 1 (only used 1 img), latent_h, latent_w - print("Computing latents") - image_latents = pipe.compute_latents(vd_out).numpy() - # padding_width = [(0, 0), (0, 0), (0, frames-inpainting_frames), (0, 0), (0, 0)] - # padded_latents = np.pad(image_latents, pad_width=padding_width, mode='constant', constant_values=0) - - latent_h = args.height // 8 - latent_w = args.width // 8 - latent_noise = np.random.normal(size=(1, 4, args.frames, latent_h, latent_w)) - mask = np.ones(shape=(1, 4, args.frames, latent_h, latent_w)) - - mask_weights = [keys.inpainting_weights_series[frame_idx] for frame_idx in range(args.frames)] - - for i in range(args.frames): - v = mask_weights[i] - mask[:, :, i, :, :] = v - - masked_latents = image_latents * (1 - mask) + latent_noise * mask - - latents = torch.tensor(masked_latents).to(device) - - mask = torch.tensor(mask).to(device) - - args.strength = 1 - - samples, _ = pipe.infer(args.prompt, args.n_prompt, args.steps, args.frames, 
args.seed + batch if args.seed != -1 else -1, args.cfg_scale, - args.width, args.height, args.eta, cpu_vae, device, latents, strength=args.strength, skip_steps=skip_steps, mask=mask, is_vid2vid=args.do_vid2vid, sampler=args.sampler) - - if batch > 0: - outdir_current = os.path.join(get_outdir(), f"{init_timestring}_{batch}", str(video_number)) - else: - outdir_current = os.path.join(get_outdir(), f"{init_timestring}", str(video_number)) - print(f'text2video finished, saving frames to {outdir_current}') - - print(f'I made {len(samples)} samples!') - # just deleted the folder so we need to make it again - os.makedirs(outdir_current, exist_ok=True) - for i in range(len(samples)): - cv2.imwrite(outdir_current + os.path.sep + - f"{i:06}.png", samples[i]) - - gc.collect() - devices.torch_gc() - - # TODO: add params to the GUI - if not video_args.skip_video_creation: - ffmpeg_stitch_video(ffmpeg_location=video_args.ffmpeg_location, fps=video_args.fps, outmp4_path=outdir_current + os.path.sep + f"vid.mp4", imgs_path=os.path.join(outdir_current, - "%06d.png"), - stitch_from_frame=0, stitch_to_frame=-1, add_soundtrack=video_args.add_soundtrack, - audio_path=vid2vid_frames_path if video_args.add_soundtrack == 'Init Video' else video_args.soundtrack_path, crf=video_args.ffmpeg_crf, preset=video_args.ffmpeg_preset) - if video_args.stitched_videos > 0 and video_number < video_args.stitched_videos and video_args.stitched_video_strength > 0: - reverse_video_path = outdir_current + os.path.sep + f"vid_reversed.mp4" - ffmpeg_reverse_frames(ffmpeg_location=video_args.ffmpeg_location, fps=video_args.fps, outmp4_path=reverse_video_path, input_path=os.path.join(outdir_current, "%06d.png"), crf=video_args.ffmpeg_crf, preset=video_args.ffmpeg_preset) - args.do_vid2vid = True - args.vid2vid_startFrame = 0 - print(f"vid2vid start frame: {args.vid2vid_startFrame}") - print(f'strength: {args.strength}') - args.strength = video_args.stitched_video_strength - args.vid2vid_frames = open(reverse_video_path, 'rb') + + samples, _ = pipe.infer(args.prompt, args.n_prompt, args.steps, args.frames, args.seed + batch if args.seed != -1 else -1, args.cfg_scale, + args.width, args.height, args.eta, cpu_vae, device, latents, strength=args.strength, skip_steps=skip_steps, mask=mask, is_vid2vid=args.do_vid2vid, sampler=args.sampler) + + if args.batch_count > 1: + outdir_current = os.path.join(get_outdir(), f"{init_timestring}_{batch}") + else: + outdir_current = os.path.join(get_outdir(), f"{init_timestring}") + print(f'text2video finished, saving frames to {outdir_current}') + + print(f'I made {len(samples)} samples!') + # just deleted the folder so we need to make it again + os.makedirs(outdir_current, exist_ok=True) + for i in range(len(samples)): + cv2.imwrite(outdir_current + os.path.sep + + f"{i:06}.png", samples[i]) + + gc.collect() + devices.torch_gc() + + # TODO: add params to the GUI + if not video_args.skip_video_creation: + ffmpeg_stitch_video(ffmpeg_location=video_args.ffmpeg_location, fps=video_args.fps, outmp4_path=outdir_current + os.path.sep + f"vid.mp4", imgs_path=os.path.join(outdir_current, + "%06d.png"), + stitch_from_frame=0, stitch_to_frame=-1, add_soundtrack=video_args.add_soundtrack, + audio_path=vid2vid_frames_path if video_args.add_soundtrack == 'Init Video' else video_args.soundtrack_path, crf=video_args.ffmpeg_crf, preset=video_args.ffmpeg_preset) + if video_args.do_stitch_videos and video_args.stitched_video_strength > 0: + reverse_video_path = outdir_current + os.path.sep + f"vid_reversed.mp4" + 
ffmpeg_reverse_frames(ffmpeg_location=video_args.ffmpeg_location, fps=video_args.fps, outmp4_path=reverse_video_path, input_path=os.path.join(outdir_current, "%06d.png"), crf=video_args.ffmpeg_crf, preset=video_args.ffmpeg_preset) + args.do_vid2vid = True + args.vid2vid_startFrame = 0 + print(f"vid2vid start frame: {args.vid2vid_startFrame}") + print(f'strength: {args.strength}') + args.strength = video_args.stitched_video_strength + args.vid2vid_frames = open(reverse_video_path, 'rb') print(f't2v complete, result saved at {outdir_current}') - outdir_current = os.path.join(get_outdir(), f"{init_timestring}", "final") + if video_args.do_stitch_videos: + outdir_current = os.path.join(get_outdir(), f"{init_timestring}_final") - os.makedirs(outdir_current, exist_ok=True) + os.makedirs(outdir_current, exist_ok=True) - combined_video_path = os.path.join(get_outdir(), f"{init_timestring}", "final") - combined_video_list = [os.path.join(get_outdir(), f"{init_timestring}", str(i), "vid.mp4").replace('/', os.path.sep) for i in range(0, video_args.stitched_videos + 1)] + combined_video_path = os.path.join(get_outdir(), f"{init_timestring}_final") + combined_video_list = [os.path.join(get_outdir(), f"{init_timestring}_{i}", "vid.mp4").replace('/', os.path.sep) for i in range(0, args.batch_count)] - ffmpeg_combine_videos(ffmpeg_location=video_args.ffmpeg_location, fps=video_args.fps, outmp4_path=combined_video_path + os.path.sep + f"vid.mp4", - input_videos=combined_video_list, crf=video_args.ffmpeg_crf, preset=video_args.ffmpeg_preset, add_soundtrack=video_args.add_soundtrack, - audio_path=video_args.soundtrack_path) + ffmpeg_combine_videos(ffmpeg_location=video_args.ffmpeg_location, fps=video_args.fps, outmp4_path=combined_video_path + os.path.sep + f"vid.mp4", + input_videos=combined_video_list, crf=video_args.ffmpeg_crf, preset=video_args.ffmpeg_preset, add_soundtrack=video_args.add_soundtrack, + audio_path=video_args.soundtrack_path) + else: + outdir_current = os.path.join(get_outdir(), f"{init_timestring}_{batch}") mp4 = open(outdir_current + os.path.sep + f"vid.mp4", 'rb').read() dataurl = "data:video/mp4;base64," + b64encode(mp4).decode() diff --git a/scripts/t2v_helpers/args.py b/scripts/t2v_helpers/args.py index c98d044..8548cf7 100644 --- a/scripts/t2v_helpers/args.py +++ b/scripts/t2v_helpers/args.py @@ -107,7 +107,7 @@ def refresh_all_models(model): Currently only works with ModelScope''') with gr.Row(): - stitched_videos = gr.Slider(label=f"Stitched videos. 
Total Length: {(d.frames * (d.stitched_videos + 1)) / dv.fps}", value=d.stitched_videos, minimum=0, maximum=200, step=1, interactive=True) + do_stitch_videos = gr.Checkbox(label="Stitch videos", value=d.do_stitch_videos, interactive=True) stitched_video_strength = gr.Slider(label="Stitched video denoising strength", value=d.stitched_video_strength, minimum=0, maximum=1, step=0.01, interactive=True) with gr.Accordion('img2vid', open=False): inpainting_image = gr.File(label="Inpainting image", interactive=True, file_count="single", file_types=["image"], elem_id="inpainting_chosen_file") @@ -163,7 +163,7 @@ def refresh_all_models(model): return locals() -t2v_video_args_names = str('skip_video_creation, ffmpeg_location, ffmpeg_crf, ffmpeg_preset, fps, add_soundtrack, soundtrack_path, stitched_videos, stitched_video_strength').replace("\n", "").replace("\r", "").replace(" ", "").split(',') +t2v_video_args_names = str('skip_video_creation, ffmpeg_location, ffmpeg_crf, ffmpeg_preset, fps, add_soundtrack, soundtrack_path, do_stitch_videos, stitched_video_strength').replace("\n", "").replace("\r", "").replace(" ", "").split(',') common_values_names = str('''prompt, n_prompt, sampler, steps, frames, seed, cfg_scale, width, height, eta, batch_count''').replace("\n", "").replace("\r", "").replace(" ", "").split(',') @@ -213,7 +213,7 @@ def T2VArgs(): prompt = "" n_prompt = "text, watermark, copyright, blurry, nsfw" strength = 0.75 - stitched_videos = 0 + do_stitch_videos = False stitched_video_strength = 0.0 vid2vid_startFrame = 0 inpainting_weights = '0:(t/max_i_f), "max_i_f":(1)' # linear growth weights (as they used to be in the original variant) From 83715d7f6748a00a89baf768a3d0bd9663c317a1 Mon Sep 17 00:00:00 2001 From: occlusion Date: Wed, 12 Jul 2023 22:56:47 -0400 Subject: [PATCH 4/5] Wrapped vid2vid in batching so it can be used for stitching --- requirements.txt | 3 +- scripts/modelscope/process_modelscope.py | 128 +++++++++++------------ scripts/t2v_helpers/args.py | 6 +- 3 files changed, 68 insertions(+), 69 deletions(-) diff --git a/requirements.txt b/requirements.txt index dd7c46c..dfb1833 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,4 @@ imageio_ffmpeg av moviepy -numexpr -opencv-python \ No newline at end of file +numexpr \ No newline at end of file diff --git a/scripts/modelscope/process_modelscope.py b/scripts/modelscope/process_modelscope.py index cfde66b..f5f251b 100644 --- a/scripts/modelscope/process_modelscope.py +++ b/scripts/modelscope/process_modelscope.py @@ -70,89 +70,90 @@ def process_modelscope(args_dict): mask = None - if args.do_vid2vid: - if args.vid2vid_frames is None and args.vid2vid_frames_path == "": - raise FileNotFoundError("Please upload a video :()") + print('Working in txt2vid mode' if not args.do_vid2vid else 'Working in vid2vid mode') - # Overrides - if args.vid2vid_frames is not None: - vid2vid_frames_path = args.vid2vid_frames.name + # Start the batch count loop + pbar = tqdm(range(args.batch_count), leave=False) + if args.batch_count == 1: + pbar.disable = True - print("got a request to *vid2vid* an existing video.") + vids_to_pack = [] - in_vid_fps, _, _ = get_quick_vid_info(vid2vid_frames_path) - folder_name = clean_folder_name(Path(vid2vid_frames_path).stem) - outdir_no_tmp = os.path.join(os.getcwd(), 'outputs', 'frame-vid2vid', folder_name) - i = 1 - while os.path.exists(outdir_no_tmp): - outdir_no_tmp = os.path.join(os.getcwd(), 'outputs', 'frame-vid2vid', folder_name + '_' + str(i)) - i += 1 + state.job_count = args.batch_count - 
outdir_v2v = os.path.join(outdir_no_tmp, 'tmp_input_frames') - os.makedirs(outdir_v2v, exist_ok=True) + for batch in pbar: + if args.do_vid2vid: + if args.vid2vid_frames is None and args.vid2vid_frames_path == "": + raise FileNotFoundError("Please upload a video :()") - extract_to_frame=args.vid2vid_startFrame + args.frames - print(f'vid2vid_frames_path: {vid2vid_frames_path} outdir_v2v: {outdir_v2v} extract_from_frame: {args.vid2vid_startFrame} extract_to_frame: {extract_to_frame}') - vid2frames(video_path=vid2vid_frames_path, video_in_frame_path=outdir_v2v, overwrite=True, extract_from_frame=args.vid2vid_startFrame, extract_to_frame=extract_to_frame, - numeric_files_output=True, out_img_format='png') + # Overrides + if args.vid2vid_frames is not None: + vid2vid_frames_path = args.vid2vid_frames.name - temp_convert_raw_png_path = os.path.join(outdir_v2v, "tmp_vid2vid_folder") - duplicate_pngs_from_folder(outdir_v2v, temp_convert_raw_png_path, None, folder_name) + print("got a request to *vid2vid* an existing video.") - videogen = [] - for f in os.listdir(temp_convert_raw_png_path): - # double check for old _depth_ files, not really needed probably but keeping it for now - if '_depth_' not in f: - videogen.append(f) + in_vid_fps, _, _ = get_quick_vid_info(vid2vid_frames_path) + folder_name = clean_folder_name(Path(vid2vid_frames_path).stem) + outdir_no_tmp = os.path.join(os.getcwd(), 'outputs', 'frame-vid2vid', folder_name) + i = 1 + while os.path.exists(outdir_no_tmp): + outdir_no_tmp = os.path.join(os.getcwd(), 'outputs', 'frame-vid2vid', folder_name + '_' + str(i)) + i += 1 - videogen.sort(key=lambda x: int(x.split('.')[0])) + outdir_v2v = os.path.join(outdir_no_tmp, 'tmp_input_frames') + os.makedirs(outdir_v2v, exist_ok=True) - images = [] - for file in tqdm(videogen, desc="Loading frames"): - image = Image.open(os.path.join(temp_convert_raw_png_path, file)) - image = image.resize((args.width, args.height))#, Image.ANTIALIAS) - array = np.array(image) - images += [array] + extract_to_frame=args.vid2vid_startFrame + args.frames + print(f'vid2vid_frames_path: {vid2vid_frames_path} outdir_v2v: {outdir_v2v} extract_from_frame: {args.vid2vid_startFrame} extract_to_frame: {extract_to_frame}') + vid2frames(video_path=vid2vid_frames_path, video_in_frame_path=outdir_v2v, overwrite=True, extract_from_frame=args.vid2vid_startFrame, extract_to_frame=extract_to_frame, + numeric_files_output=True, out_img_format='png') - # print(images) + temp_convert_raw_png_path = os.path.join(outdir_v2v, "tmp_vid2vid_folder") + duplicate_pngs_from_folder(outdir_v2v, temp_convert_raw_png_path, None, folder_name) - images = np.stack(images) # f h w c - batches = 1 - n_images = np.tile(images[np.newaxis, ...], (batches, 1, 1, 1, 1)) # n f h w c - bcfhw = n_images.transpose(0, 4, 1, 2, 3) - # convert to 0-1 float - bcfhw = bcfhw.astype(np.float32) / 255 - bfchw = bcfhw.transpose(0, 2, 1, 3, 4) # b c f h w + videogen = [] + for f in os.listdir(temp_convert_raw_png_path): + # double check for old _depth_ files, not really needed probably but keeping it for now + if '_depth_' not in f: + videogen.append(f) - print(f"Converted the frames to tensor {bfchw.shape}") + videogen.sort(key=lambda x: int(x.split('.')[0])) - vd_out = torch.from_numpy(bcfhw).to("cuda") + images = [] + for file in tqdm(videogen, desc="Loading frames"): + image = Image.open(os.path.join(temp_convert_raw_png_path, file)) + image = image.resize((args.width, args.height))#, Image.ANTIALIAS) + array = np.array(image) + images += [array] - # should be 
-1,1, not 0,1 - vd_out = 2 * vd_out - 1 + # print(images) - # latents should have shape num_sample, 4, max_frames, latent_h,latent_w - print("Computing latents") - latents = pipe.compute_latents(vd_out).to(device) + images = np.stack(images) # f h w c + batches = 1 + n_images = np.tile(images[np.newaxis, ...], (batches, 1, 1, 1, 1)) # n f h w c + bcfhw = n_images.transpose(0, 4, 1, 2, 3) + # convert to 0-1 float + bcfhw = bcfhw.astype(np.float32) / 255 + bfchw = bcfhw.transpose(0, 2, 1, 3, 4) # b c f h w - skip_steps = int(math.floor(args.steps * max(0, min(1 - args.strength, 1)))) - else: - latents = None - args.strength = 1 - skip_steps = 0 + print(f"Converted the frames to tensor {bfchw.shape}") - print('Working in txt2vid mode' if not args.do_vid2vid else 'Working in vid2vid mode') + vd_out = torch.from_numpy(bcfhw).to("cuda") - # Start the batch count loop - pbar = tqdm(range(args.batch_count), leave=False) - if args.batch_count == 1: - pbar.disable = True + # should be -1,1, not 0,1 + vd_out = 2 * vd_out - 1 - vids_to_pack = [] + # latents should have shape num_sample, 4, max_frames, latent_h,latent_w + print("Computing latents") + latents = pipe.compute_latents(vd_out).to(device) - state.job_count = args.batch_count + skip_steps = int(math.floor(args.steps * max(0, min(1 - args.strength, 1)))) + else: + latents = None + args.strength = 1 + skip_steps = 0 - for batch in pbar: + #do txt2vid state.job_no = batch if state.skipped: state.skipped = False @@ -222,7 +223,6 @@ def process_modelscope(args_dict): outdir_current = os.path.join(get_outdir(), f"{init_timestring}") print(f'text2video finished, saving frames to {outdir_current}') - print(f'I made {len(samples)} samples!') # just deleted the folder so we need to make it again os.makedirs(outdir_current, exist_ok=True) for i in range(len(samples)): @@ -238,7 +238,7 @@ def process_modelscope(args_dict): "%06d.png"), stitch_from_frame=0, stitch_to_frame=-1, add_soundtrack=video_args.add_soundtrack, audio_path=vid2vid_frames_path if video_args.add_soundtrack == 'Init Video' else video_args.soundtrack_path, crf=video_args.ffmpeg_crf, preset=video_args.ffmpeg_preset) - if video_args.do_stitch_videos and video_args.stitched_video_strength > 0: + if video_args.do_stitch_videos and video_args.stitched_video_strength > 0.35: reverse_video_path = outdir_current + os.path.sep + f"vid_reversed.mp4" ffmpeg_reverse_frames(ffmpeg_location=video_args.ffmpeg_location, fps=video_args.fps, outmp4_path=reverse_video_path, input_path=os.path.join(outdir_current, "%06d.png"), crf=video_args.ffmpeg_crf, preset=video_args.ffmpeg_preset) args.do_vid2vid = True diff --git a/scripts/t2v_helpers/args.py b/scripts/t2v_helpers/args.py index 8548cf7..00b46b5 100644 --- a/scripts/t2v_helpers/args.py +++ b/scripts/t2v_helpers/args.py @@ -101,9 +101,9 @@ def refresh_all_models(model): # TODO: make it how it's done in Deforum/WebUI, so we won't have to track individual vars prompt, n_prompt, sampler, steps, seed, cfg_scale, width, height, eta, frames, batch_count = setup_common_values('txt2vid', d) model_type.change(fn=enable_sampler_dropdown, inputs=[model_type], outputs=[sampler]) - gr.Markdown('''`Stitched videos` allows you to generate multiple videos consecutively and combine them into - one video when they're done. Use inpainting frames and inpainting weights to adjust the transition between - videos. + gr.Markdown('''`Stitch videos` allows you to generate multiple videos consecutively and combine them into + one video when they're done. 
Use stitched video denoising strenght to adjust the continuity between videos. + Uses the batch count parameter to determine how many videos to generate and stitch together. Currently only works with ModelScope''') with gr.Row(): From 4c0db8c7cf8bc7659104f3dc12aa314c439484cb Mon Sep 17 00:00:00 2001 From: occlusion Date: Wed, 12 Jul 2023 22:58:21 -0400 Subject: [PATCH 5/5] Fixed a typo --- scripts/t2v_helpers/args.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/t2v_helpers/args.py b/scripts/t2v_helpers/args.py index 00b46b5..70ccfa7 100644 --- a/scripts/t2v_helpers/args.py +++ b/scripts/t2v_helpers/args.py @@ -102,7 +102,7 @@ def refresh_all_models(model): prompt, n_prompt, sampler, steps, seed, cfg_scale, width, height, eta, frames, batch_count = setup_common_values('txt2vid', d) model_type.change(fn=enable_sampler_dropdown, inputs=[model_type], outputs=[sampler]) gr.Markdown('''`Stitch videos` allows you to generate multiple videos consecutively and combine them into - one video when they're done. Use stitched video denoising strenght to adjust the continuity between videos. + one video when they're done. Use stitched video denoising strength to adjust the continuity between videos. Uses the batch count parameter to determine how many videos to generate and stitch together. Currently only works with ModelScope''')
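
Reviewer note: the control flow that patches 3 and 4 converge on can be hard to follow across the reflowed hunks, so here is a minimal pseudocode-style sketch of it in Python. generate_clip, reverse_clip and merge_clips are illustrative placeholders standing in for pipe.infer plus the ffmpeg helpers, not functions that exist in this repo; the 0.35 strength threshold mirrors the check added in patch 4.

from typing import List, Optional

def generate_clip(init_video: Optional[str], strength: float, frames: int) -> str:
    """Placeholder for one ModelScope render (txt2vid, or vid2vid when init_video is set)."""
    raise NotImplementedError

def reverse_clip(clip_path: str) -> str:
    """Placeholder for ffmpeg_reverse_frames: returns a frame-reversed copy of the clip."""
    raise NotImplementedError

def merge_clips(clip_paths: List[str], out_path: str) -> None:
    """Placeholder for ffmpeg_combine_videos."""
    raise NotImplementedError

def stitched_generation(batch_count: int, frames: int,
                        stitch_strength: float, do_stitch: bool) -> None:
    clip_paths: List[str] = []
    init_video: Optional[str] = None  # first clip is plain txt2vid
    for _ in range(batch_count):
        clip = generate_clip(init_video=init_video,
                             strength=stitch_strength if init_video else 1.0,
                             frames=frames)
        clip_paths.append(clip)
        if do_stitch and stitch_strength > 0.35:
            # Reverse the finished clip so the next vid2vid pass effectively
            # continues from this clip's last frame.
            init_video = reverse_clip(clip)
    if do_stitch:
        merge_clips(clip_paths, "vid.mp4")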
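
The merging step added as ffmpeg_combine_videos follows the standard "remux each mp4 to MPEG-TS, then join with the concat: protocol and stream copy" recipe. Below is a self-contained sketch of that recipe, assuming ffmpeg is on PATH; the clip paths in the usage comment are hypothetical.

import subprocess
from typing import List

def concat_mp4s(ffmpeg: str, clips: List[str], out_path: str) -> None:
    """Join H.264 .mp4 clips without re-encoding: remux each clip to MPEG-TS,
    then concatenate the .ts parts with ffmpeg's concat: protocol and stream copy."""
    ts_parts = []
    for clip in clips:
        ts_path = clip + ".ts"
        subprocess.run(
            [ffmpeg, "-y", "-i", clip, "-c", "copy",
             "-bsf:v", "h264_mp4toannexb", "-f", "mpegts", ts_path],
            check=True, capture_output=True)
        ts_parts.append(ts_path)
    subprocess.run(
        [ffmpeg, "-y", "-i", "concat:" + "|".join(ts_parts),
         "-c", "copy", "-bsf:a", "aac_adtstoasc", out_path],
        check=True, capture_output=True)

# Hypothetical usage (paths are illustrative only):
# concat_mp4s("ffmpeg", ["run_0/vid.mp4", "run_1/vid.mp4"], "run_final/vid.mp4")

Because the join is a stream copy, the final video keeps the per-clip quality set by ffmpeg_crf, but it requires all clips to share codec, resolution and fps, which holds here since every clip comes from the same render settings.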