Commit 3d5922ff authored by Jan Hartig

Revert "Implement actual multi gpu support"

This reverts commit 7dcbf223.
parent 7dcbf223
.dockerignore
-venv
-test
-README.md
\ No newline at end of file
+venv
\ No newline at end of file
Dockerfile
@@ -22,9 +22,9 @@ COPY --from=downloader /hf /hf
 # install dependencies
 RUN python3 -m pip --no-cache-dir install tqdm
-COPY . .
+COPY main.py .
-VOLUME "/out"
-VOLUME "/inputfile"
+VOLUME "/output/"
+VOLUME "/input"
 ENTRYPOINT ["python3", "main.py"]
\ No newline at end of file
README.md
 Usage:
 ```shell
-docker build -t whisper-webvtt-transcriber .
+docker build -t whisper-transcriber .
 docker run --rm -it \
-    -v <inputfile>:/inputfile:ro \
-    -v <outputfolder>:/out whisper-transcriber
-```
-Multi-GPU Note:
-https://github.com/guillaumekln/faster-whisper/issues/100#issuecomment-1492141352
\ No newline at end of file
+    -v <inputfile>:/input/audiofile:ro \
+    -v <output_dir>:/output whisper-transcriber
+```
\ No newline at end of file
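For reference, the Multi-GPU Note removed above points at a faster-whisper issue on multi-GPU usage. The approach the reverted code takes is to load the model with a list of `device_index` values (one CTranslate2 replica per GPU) and run `transcribe()` calls concurrently. A minimal standalone sketch of that pattern, assuming two CUDA devices and illustrative input paths that are not part of this repository:

```python
# Sketch only: assumes two CUDA devices; the input paths are illustrative.
from concurrent.futures import ThreadPoolExecutor

from faster_whisper import WhisperModel

# A list of device indices keeps one CTranslate2 model replica per GPU;
# num_workers > 1 lets concurrent calls use those replicas in parallel
# (the reverted code did not set num_workers).
model = WhisperModel("large-v2", device="cuda", device_index=[0, 1],
                     compute_type="float32", num_workers=2)


def transcribe(path):
    segments, _info = model.transcribe(path, language="de", beam_size=5)
    return list(segments)  # iterate here so the decoding runs inside this thread


with ThreadPoolExecutor(max_workers=2) as pool:
    futures = [pool.submit(transcribe, p) for p in ("/input/part1.wav", "/input/part2.wav")]
    results = [f.result() for f in futures]
```

This is essentially what the reverted main.py below did, with prepare_multi_gpu supplying the per-GPU chunks instead of separate input files.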
helper.py (deleted)
-from faster_whisper import WhisperModel
-from numpy import ndarray
-from typing import List, Tuple
-
-
-def gen_cue(start: float, end: float) -> str:
-    # calculate cue times
-    start_m, start_s = divmod(start, 60)
-    start_h, start_m = divmod(start_m, 60)
-    end_m, end_s = divmod(end, 60)
-    end_h, end_m = divmod(end_m, 60)
-    return "\n\n{:02.0f}:{:02.0f}:{:06.3f} --> {:02.0f}:{:02.0f}:{:06.3f}\n".format(start_h, start_m, start_s, end_h, end_m, end_s)
-
-
-def gen_segments(model: WhisperModel, audio: ndarray, offset: float) -> List[Tuple[str, str]]:
-    segments, _ = model.transcribe(audio)
-    results = []
-    for segment in segments:
-        start = segment.start + offset
-        end = segment.end + offset
-        cue = gen_cue(start, end)
-        text = segment.text[1:]
-        results.append((cue, text))
-    return results
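As a worked check of the WebVTT timestamp format produced by the removed `gen_cue` above, using illustrative input values:

```python
# Worked example with illustrative values: a segment from 65.5 s to 68.0 s.
# divmod(65.5, 60) -> (1.0, 5.5) and divmod(1.0, 60) -> (0.0, 1.0), i.e. 0 h, 1 m, 5.5 s.
print(repr(gen_cue(65.5, 68.0)))
# '\n\n00:01:05.500 --> 00:01:08.000\n'
```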
main.py
@@ -2,83 +2,68 @@ from ctranslate2 import get_cuda_device_count, get_supported_compute_types
 from faster_whisper import WhisperModel
 from time import perf_counter
 from tqdm import tqdm
-from multiGPU import prepare_multi_gpu
-from helper import gen_cue, gen_segments
-from concurrent.futures import ThreadPoolExecutor
 model_size = "large-v2"
 compute_type = "float32"
-preamble = "WEBVTT\n\nNOTE This transcript was automatically generated."
-inputfile = "/inputfile"
-outputfile = "/out/transcript.vtt"
-language = "de"
 # Check for GPU presence
-gpus = get_cuda_device_count()
-if gpus < 1:
+nGpus = get_cuda_device_count()
+if nGpus < 1:
     print("No cuda device found!")
     exit(1)
 # Check GPU capabilities
-capable_gpus = []
-for index in range(gpus):
+capableGPUs = []
+for index in range(nGpus):
     supported = get_supported_compute_types("cuda", index)
     if compute_type in supported:
-        capable_gpus.append(index)
+        capableGPUs.append(index)
-if len(capable_gpus) < 1:
+if len(capableGPUs) < 1:
     print("No {} capable cuda device found!".format(compute_type))
     exit(1)
-print("Found {} {} capable GPUs".format(len(capable_gpus), compute_type))
+print("Using {} {} capable GPUs".format(len(capableGPUs), compute_type))
-t_start = perf_counter()
+tStart = perf_counter()
-model = WhisperModel(model_size, device="cuda", device_index=capable_gpus, compute_type="float32", local_files_only=True)
+# Run on GPU with FP32
+model = WhisperModel(model_size, device="cuda", device_index=capableGPUs, compute_type="float32", local_files_only=True)
 print("Model initialized")
-# Check / prepare multi GPU processing
-multi_gpu, audio, duration = prepare_multi_gpu(inputfile, gpus=gpus, sample_rate=model.feature_extractor.sampling_rate)
+# Run on CPU
+# model = WhisperModel(model_size, device="cpu", compute_type="float32")
-print("Start processing...")
-if not multi_gpu:
-    segments, _ = model.transcribe(audio, language=language, beam_size=8)
+segments, info = model.transcribe("/input/audiofile", language="de", beam_size=5)
-    with open(outputfile, "w", encoding="utf-8") as f:
-        f.write(preamble)
+with open("/output/transcript.vtt", "w", encoding="utf-8") as f:
+    f.write("WEBVTT\n\nNOTE This transcript was automatically generated.")
-        with tqdm(total=duration, leave=False) as pbar:
-            previousEnd = 0
-            for segment in segments:
-                cue = gen_cue(segment.start, segment.end)
+    print("Start processing...")
+    with tqdm(total=info.duration, leave=False) as pbar:
+        previousEnd = 0
+        for segment in segments:
+            # calculate cue times
+            startM, startS = divmod(segment.start, 60)
+            startH, startM = divmod(startM, 60)
-                # write cue & text
-                f.write(cue)
-                f.write(segment.text[1:])
+            endM, endS = divmod(segment.end, 60)
+            endH, endM = divmod(endM, 60)
-                # update progressbar
-                pbar.update(segment.end - previousEnd)
-                previousEnd = segment.end
+            # write cue & text
+            f.write("\n\n{:02.0f}:{:02.0f}:{:06.3f} --> {:02.0f}:{:02.0f}:{:06.3f}\n".format(startH, startM, startS, endH, endM, endS))
+            f.write(segment.text[1:])
-else:
-    with ThreadPoolExecutor(max_workers=gpus) as executor:
-        futures = []
-        for offset, part in audio:
-            futures.append(executor.submit(gen_segments, model, part, offset))
+            # update progressbar
+            pbar.update(segment.end - previousEnd)
+            previousEnd = segment.end
-        with open(outputfile, "w", encoding="utf-8") as f:
-            f.write(preamble)
-            for future in futures:
-                for cue, text in future.result():
-                    f.write(cue)
-                    f.write(text)
 print("Done!")
-t_delta = perf_counter() - t_start
-t_delta_m, t_delta_s = divmod(t_delta, 60)
-duration_m, duration_s = divmod(duration, 60)
-print("Processed {:02.0f}m {:02.0f}s audio in {:02.0f}m {:02.0f}s".format(duration_m, duration_s, t_delta_m, t_delta_s))
+tDelta = perf_counter() - tStart
+tDeltaM, tDeltaS = divmod(tDelta, 60)
+durationM, durationS = divmod(info.duration, 60)
+print("Processed {:02.0f}m {:02.0f}s audio in {:02.0f}m {:02.0f}s".format(durationM, durationS, tDeltaM, tDeltaS))
multiGPU.py (deleted)
-from faster_whisper.vad import VadOptions, get_speech_timestamps
-from faster_whisper.audio import decode_audio
-from numpy import ndarray
-from typing import Tuple, List, Union, BinaryIO
-
-
-def prepare_multi_gpu(inputfile: Union[str, BinaryIO], gpus: int, sample_rate: int) -> Tuple[int, Union[ndarray, List[Tuple[float, ndarray]]], float]:
-    audio = decode_audio(inputfile, sampling_rate=sample_rate)
-    # Get file duration in seconds
-    duration = audio.shape[0] / sample_rate
-    if gpus < 2:
-        return False, audio, duration
-    # Don't split files under 5 minutes
-    if duration < 300:
-        print("Input duration is under 5 minutes. Falling back to single GPU transcribing.")
-        return False, audio, duration
-    print("Trying to find silences to split file into multiple jobs for parallel execution...")
-    # Slightly higher than default threshold, 10s minimum silence before splitting
-    vad_parameters = VadOptions(threshold=.6, min_silence_duration_ms=10000)
-    speech_chunks = get_speech_timestamps(audio, vad_parameters)
-    # Naively check average duration of parts
-    sum_durations = 0
-    for chunk in speech_chunks:
-        start = chunk["start"] / sample_rate
-        end = chunk["end"] / sample_rate
-        sum_durations += end - start
-    avg_duration = sum_durations / len(speech_chunks)  # in seconds
-    if len(speech_chunks) < 2:
-        print("Could not find enough silences to split file. Falling back to single GPU transcribing.")
-        return False, audio, duration
-    if avg_duration < 120:
-        print("Chunks without silence average under two minutes. Falling back to single GPU transcribing.")
-        return False, audio, duration
-    print("Splitting file into {} jobs.".format(len(speech_chunks)))
-    jobs = []
-    for chunk in speech_chunks:
-        start = chunk["start"]
-        segment = audio[chunk["start"]:chunk["end"]]
-        jobs.append((start, segment))
-    return True, jobs, duration
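A short usage sketch for the removed `prepare_multi_gpu`, showing the two return shapes the reverted main.py had to handle; the GPU count and sample rate below are illustrative assumptions rather than repository defaults:

```python
# Sketch only: gpus and sample_rate are example values.
multi_gpu, audio, duration = prepare_multi_gpu("/inputfile", gpus=2, sample_rate=16000)

if not multi_gpu:
    # Single job: audio is one numpy array covering the whole file.
    print("one job, {:.0f}s of audio".format(duration))
else:
    # Parallel jobs: audio is a list of (offset, chunk) pairs, where the offset
    # is the chunk start as returned by get_speech_timestamps.
    for offset, chunk in audio:
        print("job at offset {} covering {} samples".format(offset, chunk.shape[0]))
```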