From e6f2c0971da4c7784d907bc4a77400d067087127 Mon Sep 17 00:00:00 2001
From: Jan Hartig <jan.hartig@ptb.de>
Date: Thu, 20 Mar 2025 16:32:13 +0000
Subject: [PATCH] Switch to WhisperX backend

---
 architecture.md     |  4 ++--
 localisations.toml  | 12 ++++++++++--
 mailservice.py      |  2 +-
 requirements.txt    | 10 +++++-----
 routes.py           |  8 +++++---
 templates/base.html |  3 ++-
 6 files changed, 25 insertions(+), 14 deletions(-)

diff --git a/architecture.md b/architecture.md
index b1cf1f4..78269a9 100644
--- a/architecture.md
+++ b/architecture.md
@@ -2,7 +2,7 @@
 
 1. Webserver takes and validates user submitted files
 2. Cron job scans files and enqueues new jobs on cluster
-3. Job gets processed on the cluster using [whisper-webvtt-transcriber](https://gitlab1.ptb.de/janhartig/whisper-webvtt-transcriber)
+3. Job gets processed on the cluster using [WhisperX](https://github.com/m-bain/whisperX)
 4. Mailservice scans job folders for completed jobs and:
    - Sends processed files to users
    - Optional: Notifies admins on processing errors
@@ -24,7 +24,7 @@ job_uuid:
 Preprocessed input file. Contains only audio data to conserve disk space.
 
 ### video_language.txt
-Contains the video language tag used for processing with [whisper-webvtt-transcriber](https://gitlab1.ptb.de/janhartig/whisper-webvtt-transcriber).
+Contains the video language tag used for processing with [WhisperX](https://github.com/m-bain/whisperX).
 Is used by the cronjob script (step 3).
 
 ### metadata.json
diff --git a/localisations.toml b/localisations.toml
index 6fa0b6a..856ea0e 100644
--- a/localisations.toml
+++ b/localisations.toml
@@ -40,8 +40,8 @@ de = "Sprache"
 en = "Language"
 
 [ language.helptext ]
-de = "Die gesprochene Sprache der Aufnahme.<br>Bei mehrsprachigen Aufnahmen wählen Sie die Häufigste."
-en = "Spoken language of recording.<br>For multi-language recordings choose the most frequent."
+de = "Die gesprochene Sprache der Aufnahme.<br>Bei mehrsprachigen Aufnahmen wählen Sie die automatische Erkennung."
+en = "Spoken language of recording.<br>For multi-language recordings choose auto-detect."
 
 [ language.choose ]
 de = "Wählen..."
@@ -63,6 +63,14 @@ en = "French"
 de = "Spanisch"
 en = "Spanish"
 
+[ language.options.it ]
+de = "Italienisch"
+en = "Italian"
+
+[ language.options.auto ]
+de = "Automatisch"
+en = "Auto-detect"
+
 [ language.feedback.required ]
 de = "Bitte wählen Sie die Sprache des Videos."
 en = "Please select language of recording."
diff --git a/mailservice.py b/mailservice.py
index e1316ea..12d4cc0 100644
--- a/mailservice.py
+++ b/mailservice.py
@@ -82,7 +82,7 @@ def main(end):
                         .name
                     )
 
-                    with open(Path(job).joinpath("subtitles.vtt")) as f:
+                    with open(Path(job).joinpath("audio.vtt")) as f:
                         msg.add_attachment(f.read(), filename=filename)
 
                     s.send_message(msg)
diff --git a/requirements.txt b/requirements.txt
index 4983494..4404150 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
-av~=13.0.0
-Flask~=3.0.0
-Flask-WTF~=1.2.1
-wtforms[email]~=3.1.2
-whitenoise~=6.7.0
+av~=14.0.0
+Flask~=3.1.0
+Flask-WTF~=1.2.2
+wtforms[email]~=3.2.1
+whitenoise~=6.9.0
 requests~=2.32.3
\ No newline at end of file
diff --git a/routes.py b/routes.py
index 5b7f51b..472e880 100644
--- a/routes.py
+++ b/routes.py
@@ -48,7 +48,7 @@ def upload(language: str):
                     audio_stream = [stream for stream in container.streams if stream.type == "audio"][0]
 
                     with av.open(path.join(folder_path, "audio.mkv"), "w") as out:
-                        out_stream = out.add_stream(template=audio_stream)
+                        out_stream = out.add_stream_from_template(audio_stream)
 
                         for packet in container.demux(audio_stream):
                             # Skip the "flushing" packets that `demux` generates.
@@ -60,10 +60,12 @@ def upload(language: str):
 
                             out.mux(packet)
 
+                video_language = "None" if form.language.data == "auto" else form.language.data
+
                 metadata = {
                     "email": form.email.data,
                     "language": language,
-                    "video_language": form.language.data,
+                    "video_language": video_language,
                     "filename": file.filename,
                 }
 
@@ -71,7 +73,7 @@ def upload(language: str):
                     json.dump(metadata, f)
 
                 with open(path.join(folder_path, "video_language.txt"), "w") as f:
-                    f.write("{}".format(form.language.data))
+                    f.write(video_language)
 
                 open(path.join(folder_path, "new"), "wb").close()
 
diff --git a/templates/base.html b/templates/base.html
index a97e354..6d648be 100644
--- a/templates/base.html
+++ b/templates/base.html
@@ -63,7 +63,8 @@
         <br>
         <p class="mb-1">{{ config["LOCALISATIONS"]["contact"]["text"][request.language] }}:</p>
         <p class="mb-1">{{ config["CONTACT"]["ORG"] }} <a class="link-secondary" href="mailto:{{ config["CONTACT"]["MAIL"] }}">{{ config["CONTACT"]["NAME"] }}</a></p>
-        <p class="font-monospace"><a class="link-secondary" target="_blank" referrerpolicy="no-referrer" href="https://gitlab1.ptb.de/janhartig/whisper-webvtt-transcriber">whisper-webvtt-transcriber</a></p>
+        <p class="font-monospace"><a class="link-secondary" target="_blank" referrerpolicy="no-referrer" href="https://github.com/m-bain/whisperX">WhisperX</a><br>
+        <a class="link-secondary" target="_blank" referrerpolicy="no-referrer" href="https://gitlab1.ptb.de/janhartig/ptb-subtitler">ptb-subtitle-service</a></p>
     </div>
 </footer>
 <script src="{{ url_for('static', filename='js/bootstrap.bundle.min.js') }}"></script>
-- 
GitLab