diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index c543e10..ef5fa6c 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -25,8 +25,7 @@ jobs:
     uses: actions/setup-python@v2
     with:
      python-version: 3.9
-  - run: pip install -r requirements.txt
-  - run: pip install gdown
+  - run: pip install -r requirements.txt gdown
   - run: gdown 13QpWFWJ37EB-nHrEOY64CEtQWY-tz7DZ
   - run: ./run.py -f=.github/examples/face.jpg -t=.github/examples/target.mp4 -o=.github/examples/output.mp4
   - run: ffmpeg -i .github/examples/snapshot.mp4 -i .github/examples/output.mp4 -filter_complex "psnr" -f null -
diff --git a/roop/core.py b/roop/core.py
index 1759c3b..fb6ea1c 100755
--- a/roop/core.py
+++ b/roop/core.py
@@ -1,12 +1,15 @@
 #!/usr/bin/env python3
 
+import os
+import sys
+# a single OpenMP thread doubles gpu-mode performance - must be set before torch is imported ('--gpu-vendor=x' or '--gpu-vendor x')
+if any(arg.startswith('--gpu-vendor') for arg in sys.argv):
+    os.environ['OMP_NUM_THREADS'] = '1'
 import platform
 import signal
-import sys
 import shutil
 import glob
 import argparse
-import os
 import torch
 from pathlib import Path
 from opennsfw2 import predict_video_frames, predict_image
diff --git a/roop/swapper.py b/roop/swapper.py
index 20294cf..9f25b46 100644
--- a/roop/swapper.py
+++ b/roop/swapper.py
@@ -1,10 +1,12 @@
 import os
 from tqdm import tqdm
+import torch
+import onnxruntime
 import cv2
 import insightface
+
 import roop.globals
 from roop.analyser import get_face_single, get_face_many
-import onnxruntime
 
 FACE_SWAPPER = None
 
@@ -29,23 +31,17 @@ def swap_face_in_frame(source_face, target_face, frame):
     return frame
 
 
-def process_faces(source_face, frame, progress):
-    progress_status = 'S'
+def process_faces(source_face, target_frame, progress):
     if roop.globals.all_faces:
-        many_faces = get_face_many(frame)
+        many_faces = get_face_many(target_frame)
         if many_faces:
             for face in many_faces:
-                frame = swap_face_in_frame(source_face, face, frame)
-            progress_status='.'
+                target_frame = swap_face_in_frame(source_face, face, target_frame)
     else:
-        face = get_face_single(frame)
+        face = get_face_single(target_frame)
         if face:
-            frame = swap_face_in_frame(source_face, face, frame)
-            progress_status='.'
-
-    if progress:
-        progress.set_postfix(status=progress_status, refresh=True)
-    return frame
+            target_frame = swap_face_in_frame(source_face, face, target_frame)
+    return target_frame
 
 
 def process_video(source_img, frame_paths, preview_callback):
@@ -54,6 +50,8 @@ def process_video(source_img, frame_paths, preview_callback):
 
     with tqdm(total=len(frame_paths), desc="Processing", unit="frame", dynamic_ncols=True, bar_format=progress_bar_format) as progress:
         for frame_path in frame_paths:
+            if roop.globals.gpu_vendor == 'nvidia':
+                progress.set_postfix(cuda_utilization="{:02d}%".format(torch.cuda.utilization()), cuda_memory="{:02d}%".format(torch.cuda.memory_usage()))  # both are nvidia-smi style percentages, not sizes
             frame = cv2.imread(frame_path)
             try:
                 result = process_faces(source_face, frame, progress)
@@ -61,7 +59,6 @@ def process_video(source_img, frame_paths, preview_callback):
                 if preview_callback:
                     preview_callback(cv2.cvtColor(result, cv2.COLOR_BGR2RGB))                
             except Exception:
-                progress.set_postfix(status='E', refresh=True)
                 pass
             progress.update(1)
 
diff --git a/roop/utils.py b/roop/utils.py
index d63807c..3ec6872 100644
--- a/roop/utils.py
+++ b/roop/utils.py
@@ -31,10 +31,8 @@ def detect_fps(input_path):
 
 
 def run_ffmpeg(args):
-
     log_level = f'-loglevel {roop.globals.log_level}'
-
-    os.system(f'ffmpeg {log_level} {args}')
+    run_command(f'ffmpeg {log_level} {args}')
 
 
 def set_fps(input_path, output_path, fps):