Commit 3d5922ff authored by Jan Hartig

Revert "Implement actual multi gpu support"

This reverts commit 7dcbf223.
parent 7dcbf223
.dockerignore
-venv
-test
-README.md
\ No newline at end of file
+venv
\ No newline at end of file
Dockerfile
@@ -22,9 +22,9 @@ COPY --from=downloader /hf /hf
 # install dependencies
 RUN python3 -m pip --no-cache-dir install tqdm
-COPY . .
+COPY main.py .
-VOLUME "/out"
-VOLUME "/inputfile"
+VOLUME "/output/"
+VOLUME "/input"
 ENTRYPOINT ["python3", "main.py"]
\ No newline at end of file
README.md
 Usage:
 ```shell
-docker build -t whisper-webvtt-transcriber .
+docker build -t whisper-transcriber .
 docker run --rm -it \
-    -v <inputfile>:/inputfile:ro \
-    -v <outputfolder>:/out whisper-transcriber
-```
-Multi-GPU Note:
-https://github.com/guillaumekln/faster-whisper/issues/100#issuecomment-1492141352
\ No newline at end of file
+    -v <inputfile>:/input/audiofile:ro \
+    -v <output_dir>:/output whisper-transcriber
+```
\ No newline at end of file
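For reference, the Multi-GPU Note removed above points at a faster-whisper issue on multi-GPU usage. The approach the reverted code takes is to load the model with a list of `device_index` values (one CTranslate2 replica per GPU) and run `transcribe()` calls concurrently. A minimal standalone sketch of that pattern, assuming two CUDA devices and illustrative input paths that are not part of this repository:

```python
# Sketch only: assumes two CUDA devices; the input paths are illustrative.
from concurrent.futures import ThreadPoolExecutor

from faster_whisper import WhisperModel

# A list of device indices keeps one CTranslate2 model replica per GPU;
# num_workers > 1 lets concurrent calls use those replicas in parallel
# (the reverted code did not set num_workers).
model = WhisperModel("large-v2", device="cuda", device_index=[0, 1],
                     compute_type="float32", num_workers=2)


def transcribe(path):
    segments, _info = model.transcribe(path, language="de", beam_size=5)
    return list(segments)  # iterate here so the decoding runs inside this thread


with ThreadPoolExecutor(max_workers=2) as pool:
    futures = [pool.submit(transcribe, p) for p in ("/input/part1.wav", "/input/part2.wav")]
    results = [f.result() for f in futures]
```

This is essentially what the reverted main.py below did, with prepare_multi_gpu supplying the per-GPU chunks instead of separate input files.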
helper.py (deleted)
-from faster_whisper import WhisperModel
-from numpy import ndarray
-from typing import List, Tuple
-
-
-def gen_cue(start: float, end: float) -> str:
-    # calculate cue times
-    start_m, start_s = divmod(start, 60)
-    start_h, start_m = divmod(start_m, 60)
-    end_m, end_s = divmod(end, 60)
-    end_h, end_m = divmod(end_m, 60)
-    return "\n\n{:02.0f}:{:02.0f}:{:06.3f} --> {:02.0f}:{:02.0f}:{:06.3f}\n".format(start_h, start_m, start_s, end_h, end_m, end_s)
-
-
-def gen_segments(model: WhisperModel, audio: ndarray, offset: float) -> List[Tuple[str, str]]:
-    segments, _ = model.transcribe(audio)
-    results = []
-    for segment in segments:
-        start = segment.start + offset
-        end = segment.end + offset
-        cue = gen_cue(start, end)
-        text = segment.text[1:]
-        results.append((cue, text))
-    return results
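As a worked check of the WebVTT timestamp format produced by the removed `gen_cue` above, using illustrative input values:

```python
# Worked example with illustrative values: a segment from 65.5 s to 68.0 s.
# divmod(65.5, 60) -> (1.0, 5.5) and divmod(1.0, 60) -> (0.0, 1.0), i.e. 0 h, 1 m, 5.5 s.
print(repr(gen_cue(65.5, 68.0)))
# '\n\n00:01:05.500 --> 00:01:08.000\n'
```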
main.py
@@ -2,83 +2,68 @@ from ctranslate2 import get_cuda_device_count, get_supported_compute_types
 from faster_whisper import WhisperModel
 from time import perf_counter
 from tqdm import tqdm
-from multiGPU import prepare_multi_gpu
-from helper import gen_cue, gen_segments
-from concurrent.futures import ThreadPoolExecutor
 model_size = "large-v2"
 compute_type = "float32"
-preamble = "WEBVTT\n\nNOTE This transcript was automatically generated."
-inputfile = "/inputfile"
-outputfile = "/out/transcript.vtt"
-language = "de"
 # Check for GPU presence
-gpus = get_cuda_device_count()
-if gpus < 1:
+nGpus = get_cuda_device_count()
+if nGpus < 1:
     print("No cuda device found!")
     exit(1)
 # Check GPU capabilities
-capable_gpus = []
-for index in range(gpus):
+capableGPUs = []
+for index in range(nGpus):
     supported = get_supported_compute_types("cuda", index)
     if compute_type in supported:
-        capable_gpus.append(index)
+        capableGPUs.append(index)
-if len(capable_gpus) < 1:
+if len(capableGPUs) < 1:
     print("No {} capable cuda device found!".format(compute_type))
     exit(1)
-print("Found {} {} capable GPUs".format(len(capable_gpus), compute_type))
+print("Using {} {} capable GPUs".format(len(capableGPUs), compute_type))
-t_start = perf_counter()
+tStart = perf_counter()
-model = WhisperModel(model_size, device="cuda", device_index=capable_gpus, compute_type="float32", local_files_only=True)
+# Run on GPU with FP32
+model = WhisperModel(model_size, device="cuda", device_index=capableGPUs, compute_type="float32", local_files_only=True)
 print("Model initialized")
-# Check / prepare multi GPU processing
-multi_gpu, audio, duration = prepare_multi_gpu(inputfile, gpus=gpus, sample_rate=model.feature_extractor.sampling_rate)
+# Run on CPU
+# model = WhisperModel(model_size, device="cpu", compute_type="float32")
-print("Start processing...")
-if not multi_gpu:
-    segments, _ = model.transcribe(audio, language=language, beam_size=8)
+segments, info = model.transcribe("/input/audiofile", language="de", beam_size=5)
-    with open(outputfile, "w", encoding="utf-8") as f:
-        f.write(preamble)
+with open("/output/transcript.vtt", "w", encoding="utf-8") as f:
+    f.write("WEBVTT\n\nNOTE This transcript was automatically generated.")
-        with tqdm(total=duration, leave=False) as pbar:
-            previousEnd = 0
-            for segment in segments:
-                cue = gen_cue(segment.start, segment.end)
+    print("Start processing...")
+    with tqdm(total=info.duration, leave=False) as pbar:
+        previousEnd = 0
+        for segment in segments:
+            # calculate cue times
+            startM, startS = divmod(segment.start, 60)
+            startH, startM = divmod(startM, 60)
-                # write cue & text
-                f.write(cue)
-                f.write(segment.text[1:])
+            endM, endS = divmod(segment.end, 60)
+            endH, endM = divmod(endM, 60)
-                # update progressbar
-                pbar.update(segment.end - previousEnd)
-                previousEnd = segment.end
+            # write cue & text
+            f.write("\n\n{:02.0f}:{:02.0f}:{:06.3f} --> {:02.0f}:{:02.0f}:{:06.3f}\n".format(startH, startM, startS, endH, endM, endS))
+            f.write(segment.text[1:])
-else:
-    with ThreadPoolExecutor(max_workers=gpus) as executor:
-        futures = []
-        for offset, part in audio:
-            futures.append(executor.submit(gen_segments, model, part, offset))
+            # update progressbar
+            pbar.update(segment.end - previousEnd)
+            previousEnd = segment.end
-        with open(outputfile, "w", encoding="utf-8") as f:
-            f.write(preamble)
-            for future in futures:
-                for cue, text in future.result():
-                    f.write(cue)
-                    f.write(text)
 print("Done!")
-t_delta = perf_counter() - t_start
-t_delta_m, t_delta_s = divmod(t_delta, 60)
-duration_m, duration_s = divmod(duration, 60)
-print("Processed {:02.0f}m {:02.0f}s audio in {:02.0f}m {:02.0f}s".format(duration_m, duration_s, t_delta_m, t_delta_s))
+tDelta = perf_counter() - tStart
+tDeltaM, tDeltaS = divmod(tDelta, 60)
+durationM, durationS = divmod(info.duration, 60)
+print("Processed {:02.0f}m {:02.0f}s audio in {:02.0f}m {:02.0f}s".format(durationM, durationS, tDeltaM, tDeltaS))
multiGPU.py (deleted)
-from faster_whisper.vad import VadOptions, get_speech_timestamps
-from faster_whisper.audio import decode_audio
-from numpy import ndarray
-from typing import Tuple, List, Union, BinaryIO
-
-
-def prepare_multi_gpu(inputfile: Union[str, BinaryIO], gpus: int, sample_rate: int) -> Tuple[int, Union[ndarray, List[Tuple[float, ndarray]]], float]:
-    audio = decode_audio(inputfile, sampling_rate=sample_rate)
-    # Get file duration in seconds
-    duration = audio.shape[0] / sample_rate
-    if gpus < 2:
-        return False, audio, duration
-    # Don't split files under 5 minutes
-    if duration < 300:
-        print("Input duration is under 5 minutes. Falling back to single GPU transcribing.")
-        return False, audio, duration
-    print("Trying to find silences to split file into multiple jobs for parallel execution...")
-    # Slightly higher than default threshold, 10s minimum silence before splitting
-    vad_parameters = VadOptions(threshold=.6, min_silence_duration_ms=10000)
-    speech_chunks = get_speech_timestamps(audio, vad_parameters)
-    # Naively check average duration of parts
-    sum_durations = 0
-    for chunk in speech_chunks:
-        start = chunk["start"] / sample_rate
-        end = chunk["end"] / sample_rate
-        sum_durations += end - start
-    avg_duration = sum_durations / len(speech_chunks)  # in seconds
-    if len(speech_chunks) < 2:
-        print("Could not find enough silences to split file. Falling back to single GPU transcribing.")
-        return False, audio, duration
-    if avg_duration < 120:
-        print("Chunks without silence average under two minutes. Falling back to single GPU transcribing.")
-        return False, audio, duration
-    print("Splitting file into {} jobs.".format(len(speech_chunks)))
-    jobs = []
-    for chunk in speech_chunks:
-        start = chunk["start"]
-        segment = audio[chunk["start"]:chunk["end"]]
-        jobs.append((start, segment))
-    return True, jobs, duration
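A short usage sketch for the removed `prepare_multi_gpu`, showing the two return shapes the reverted main.py had to handle; the GPU count and sample rate below are illustrative assumptions rather than repository defaults:

```python
# Sketch only: gpus and sample_rate are example values.
multi_gpu, audio, duration = prepare_multi_gpu("/inputfile", gpus=2, sample_rate=16000)

if not multi_gpu:
    # Single job: audio is one numpy array covering the whole file.
    print("one job, {:.0f}s of audio".format(duration))
else:
    # Parallel jobs: audio is a list of (offset, chunk) pairs, where the offset
    # is the chunk start as returned by get_speech_timestamps.
    for offset, chunk in audio:
        print("job at offset {} covering {} samples".format(offset, chunk.shape[0]))
```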