From e6f2c0971da4c7784d907bc4a77400d067087127 Mon Sep 17 00:00:00 2001 From: Jan Hartig <jan.hartig@ptb.de> Date: Thu, 20 Mar 2025 16:32:13 +0000 Subject: [PATCH] Switch to WhisperX backend --- architecture.md | 4 ++-- localisations.toml | 12 ++++++++++-- mailservice.py | 2 +- requirements.txt | 10 +++++----- routes.py | 8 +++++--- templates/base.html | 3 ++- 6 files changed, 25 insertions(+), 14 deletions(-) diff --git a/architecture.md b/architecture.md index b1cf1f4..78269a9 100644 --- a/architecture.md +++ b/architecture.md @@ -2,7 +2,7 @@ 1. Webserver takes and validates user submitted files 2. Cron job scans files and enqueues new jobs on cluster -3. Job gets processed on the cluster using [whisper-webvtt-transcriber](https://gitlab1.ptb.de/janhartig/whisper-webvtt-transcriber) +3. Job gets processed on the cluster using [WhisperX](https://github.com/m-bain/whisperX) 4. Mailservice scans job folders for completed jobs and: - Sends processed files to users - Optional: Notifies admins on processing errors @@ -24,7 +24,7 @@ job_uuid: Preprocessed input file. Contains only audio data to conserve disk space. ### video_language.txt -Contains the video language tag used for processing with [whisper-webvtt-transcriber](https://gitlab1.ptb.de/janhartig/whisper-webvtt-transcriber). +Contains the video language tag used for processing with [WhisperX](https://github.com/m-bain/whisperX). Is used by the cronjob script (step 3). ### metadata.json diff --git a/localisations.toml b/localisations.toml index 6fa0b6a..856ea0e 100644 --- a/localisations.toml +++ b/localisations.toml @@ -40,8 +40,8 @@ de = "Sprache" en = "Language" [ language.helptext ] -de = "Die gesprochene Sprache der Aufnahme.<br>Bei mehrsprachigen Aufnahmen wählen Sie die Häufigste." -en = "Spoken language of recording.<br>For multi-language recordings choose the most frequent." +de = "Die gesprochene Sprache der Aufnahme.<br>Bei mehrsprachigen Aufnahmen wählen Sie die automatische Erkennung." +en = "Spoken language of recording.<br>For multi-language recordings choose auto-detect." [ language.choose ] de = "Wählen..." @@ -63,6 +63,14 @@ en = "French" de = "Spanisch" en = "Spanish" +[ language.options.it ] +de = "Italienisch" +en = "Italian" + +[ language.options.auto ] +de = "Automatisch" +en = "Auto-detect" + [ language.feedback.required ] de = "Bitte wählen Sie die Sprache des Videos." en = "Please select language of recording." diff --git a/mailservice.py b/mailservice.py index e1316ea..12d4cc0 100644 --- a/mailservice.py +++ b/mailservice.py @@ -82,7 +82,7 @@ def main(end): .name ) - with open(Path(job).joinpath("subtitles.vtt")) as f: + with open(Path(job).joinpath("audio.vtt")) as f: msg.add_attachment(f.read(), filename=filename) s.send_message(msg) diff --git a/requirements.txt b/requirements.txt index 4983494..4404150 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ -av~=13.0.0 -Flask~=3.0.0 -Flask-WTF~=1.2.1 -wtforms[email]~=3.1.2 -whitenoise~=6.7.0 +av~=14.0.0 +Flask~=3.1.0 +Flask-WTF~=1.2.2 +wtforms[email]~=3.2.1 +whitenoise~=6.9.0 requests~=2.32.3 \ No newline at end of file diff --git a/routes.py b/routes.py index 5b7f51b..472e880 100644 --- a/routes.py +++ b/routes.py @@ -48,7 +48,7 @@ def upload(language: str): audio_stream = [stream for stream in container.streams if stream.type == "audio"][0] with av.open(path.join(folder_path, "audio.mkv"), "w") as out: - out_stream = out.add_stream(template=audio_stream) + out_stream = out.add_stream_from_template(audio_stream) for packet in container.demux(audio_stream): # Skip the "flushing" packets that `demux` generates. @@ -60,10 +60,12 @@ def upload(language: str): out.mux(packet) + video_language = "None" if form.language.data == "auto" else form.language.data + metadata = { "email": form.email.data, "language": language, - "video_language": form.language.data, + "video_language": video_language, "filename": file.filename, } @@ -71,7 +73,7 @@ def upload(language: str): json.dump(metadata, f) with open(path.join(folder_path, "video_language.txt"), "w") as f: - f.write("{}".format(form.language.data)) + f.write(video_language) open(path.join(folder_path, "new"), "wb").close() diff --git a/templates/base.html b/templates/base.html index a97e354..6d648be 100644 --- a/templates/base.html +++ b/templates/base.html @@ -63,7 +63,8 @@ <br> <p class="mb-1">{{ config["LOCALISATIONS"]["contact"]["text"][request.language] }}:</p> <p class="mb-1">{{ config["CONTACT"]["ORG"] }} <a class="link-secondary" href="mailto:{{ config["CONTACT"]["MAIL"] }}">{{ config["CONTACT"]["NAME"] }}</a></p> - <p class="font-monospace"><a class="link-secondary" target="_blank" referrerpolicy="no-referrer" href="https://gitlab1.ptb.de/janhartig/whisper-webvtt-transcriber">whisper-webvtt-transcriber</a></p> + <p class="font-monospace"><a class="link-secondary" target="_blank" referrerpolicy="no-referrer" href="https://github.com/m-bain/whisperX">WhisperX</a><br> + <a class="link-secondary" target="_blank" referrerpolicy="no-referrer" href="https://gitlab1.ptb.de/janhartig/ptb-subtitler">ptb-subtitle-service</a></p> </div> </footer> <script src="{{ url_for('static', filename='js/bootstrap.bundle.min.js') }}"></script> -- GitLab