Skip to content
Snippets Groups Projects
Commit c74db80b authored by Jan Hartig's avatar Jan Hartig
Browse files

Merge branch 'dev' into 'main'

Merge new features and fixes

See merge request !4
parents 7a8162d9 e3868b24
No related branches found
No related tags found
1 merge request!4Merge new features and fixes
Pipeline #25962 passed
...@@ -18,19 +18,16 @@ def create_app(): ...@@ -18,19 +18,16 @@ def create_app():
with open("localisations.toml", "rb") as file: with open("localisations.toml", "rb") as file:
localisations = tomllib.load(file) localisations = tomllib.load(file)
app.config["LOCALISATIONS"] = localisations
del localisations
with app.app_context():
import routes
for language in app.config["ENABLED_LOCALISATIONS"]: for language in app.config["ENABLED_LOCALISATIONS"]:
check_localisation(app.logger, app.config["LOCALISATIONS"], language) check_localisation(app.logger, localisations, language)
del language app.config["LOCALISATIONS"] = localisations
app.logger.info("Localisations loaded.") app.logger.info("Localisations loaded.")
with app.app_context():
import routes
return app return app
......
# Architecture overview
1. Webserver takes and validates user-submitted files
2. Cron job scans files and enqueues new jobs on cluster
3. Job gets processed on the cluster using [whisper-webvtt-transcriber](https://gitlab1.ptb.de/janhartig/whisper-webvtt-transcriber)
4. Mailservice scans job folders for completed jobs and:
- Sends processed files to users
- Optional: Notifies admins on processing errors
- Optional: Sends monitoring data to webserver
## Job folders
Jobs are given a randomly generated uuid. A job folder looks like this:
```
job_uuid:
- audio.mkv
- video_language.txt
- metadata.json
- statefile (new/submitted/done/error)
```
### audio.mkv
Preprocessed input file. Contains only audio data to conserve disk space.
### video_language.txt
Contains the video language tag used for processing with [whisper-webvtt-transcriber](https://gitlab1.ptb.de/janhartig/whisper-webvtt-transcriber).
It is used by the cron job script (step 2).
### metadata.json
Used by mailservice (step 4).
```json
{
"email": "example@example.com",
"language": "de",
"video_language": "de",
"filename": "original_filename.original_file_extension"
}
```
### statefile
State is tracked through the following files in the job folder:
- new: Job has been submitted by user
- submitted: Job has been scheduled on GPU cluster
- done: Job has been processed without errors
- error: Job has been processed with errors
\ No newline at end of file
# Commented-out entries are optional
SECRET_KEY = "your-secret-key" SECRET_KEY = "your-secret-key"
UPLOAD_FOLDER = "uploads" UPLOAD_FOLDER = "uploads"
MAX_CONTENT_LENGTH = 10 # in GB MAX_CONTENT_LENGTH = 10 # in GB
ENABLED_LOCALISATIONS = [ "de", "en" ] ENABLED_LOCALISATIONS = [ "de", "en" ]
DEFAULT_LANGUAGE = "de"
MAIL_DOMAIN = "@example.com"
MAILSERVICE_INTERVAL = 300 # in seconds
# MONITORING_MAIL = "john.smith@example.com"
CONTACT_ORG = "Fun Inc." [ CONTACT ]
CONTACT_NAME = "John Smith" ORG = "Fun Inc."
CONTACT_MAIL = "john.smith@example.com" NAME = "John Smith"
MAIL = "john.smith@example.com"
[ MAIL ] [ MAIL ]
FROM = "funinc@example.com" FROM = "funinc@example.com"
SERVER = "smtp.example.com" SERVER = "smtp.example.com"
PORT = 25 PORT = 25
# LOCAL_HOSTNAME: Set local hostname when talking to SMTP Server # LOCAL_HOSTNAME: Set local hostname when talking to SMTP Server
\ No newline at end of file
#[ METRICS ]
#URL = "http://localhost:8080/telegraf"
#USER = "basic_auth_user"
#PASS = "basic_auth_password"
\ No newline at end of file
...@@ -19,7 +19,7 @@ def validate_audio(_, field): ...@@ -19,7 +19,7 @@ def validate_audio(_, field):
if not has_audio: if not has_audio:
raise ValidationError("noaudiotrack") raise ValidationError("noaudiotrack")
except av.AVError as e: except av.AVError as e:
current_app.logger.error( current_app.logger.info(
"Error while checking audio of file '{}': {}".format(file.filename, str(e)) "Error while checking audio of file '{}': {}".format(file.filename, str(e))
) )
raise ValidationError("brokenfile") raise ValidationError("brokenfile")
...@@ -27,15 +27,15 @@ def validate_audio(_, field): ...@@ -27,15 +27,15 @@ def validate_audio(_, field):
class UploadForm(FlaskForm): class UploadForm(FlaskForm):
email = StringField( email = StringField(
current_app.config["LOCALISATIONS"]["email"]["label"]["de"], current_app.config["LOCALISATIONS"]["email"]["label"][current_app.config["DEFAULT_LANGUAGE"]],
validators=[InputRequired("invalidEmail"), Email("invalidEmail")], validators=[InputRequired("invalidEmail"), Email("invalidEmail")],
) )
language = SelectField( language = SelectField(
current_app.config["LOCALISATIONS"]["language"]["label"]["de"], current_app.config["LOCALISATIONS"]["language"]["label"][current_app.config["DEFAULT_LANGUAGE"]],
validators=[InputRequired("required")], validators=[InputRequired("required")],
) )
file = FileField( file = FileField(
current_app.config["LOCALISATIONS"]["file"]["label"]["de"], current_app.config["LOCALISATIONS"]["file"]["label"][current_app.config["DEFAULT_LANGUAGE"]],
validators=[ validators=[
FileRequired("nofile"), FileRequired("nofile"),
FileSize(current_app.config["MAX_CONTENT_LENGTH"]), FileSize(current_app.config["MAX_CONTENT_LENGTH"]),
......
...@@ -3,8 +3,8 @@ de = "PTB Untertitel Dienst" ...@@ -3,8 +3,8 @@ de = "PTB Untertitel Dienst"
en = "PTB Subtitle Service" en = "PTB Subtitle Service"
[ leadtext ] [ leadtext ]
de = "Dieser Dienst erstellt automatisch Untertitel für Videos,<br>welche z.B. für PTB-Tube genutzt werden können.<br>Die KI-Verarbeitung läuft im HPC-Cluster der PTB." de = "Dieser Dienst erstellt automatisch Untertitel für Videos,<br>welche z.B. für <a class='link-dark link-offset-2 link-offset-3-hover link-underline link-underline-opacity-0 link-underline-opacity-75-hover' href='https://tube.ptb.de'>PTB-Tube</a> genutzt werden können.<br>Die KI-Verarbeitung läuft im HPC-Cluster der PTB."
en = "This service automatically creates subtitles for videos,<br>which can, for example, be used for PTB-Tube.<br>The AI processing runs in PTB's HPC cluster." en = "This service automatically creates subtitles for videos,<br>which can, for example, be used for <a class='link-dark link-offset-2 link-offset-3-hover link-underline link-underline-opacity-0 link-underline-opacity-75-hover' href='https://tube.ptb.de'>PTB-Tube</a>.<br>The AI processing runs in PTB's HPC cluster."
[ successtext] [ successtext]
de = "Daten erfolgreich übermittelt. Ihre Untertitel werden demnächst generiert.<br>Sobald der Prozess abgeschlossen ist, erhalten Sie eine E-Mail." de = "Daten erfolgreich übermittelt. Ihre Untertitel werden demnächst generiert.<br>Sobald der Prozess abgeschlossen ist, erhalten Sie eine E-Mail."
...@@ -87,6 +87,10 @@ en = "Error during processing. File might be corrupt or format is unsupported." ...@@ -87,6 +87,10 @@ en = "Error during processing. File might be corrupt or format is unsupported."
de = "Untertitel erstellen" de = "Untertitel erstellen"
en = "Create subtitle" en = "Create subtitle"
[ return ]
de = "Zurück"
en = "Return"
[ contact.text ] [ contact.text ]
de = "Kontakt" de = "Kontakt"
en = "Contact" en = "Contact"
......
...@@ -5,72 +5,138 @@ import tomllib ...@@ -5,72 +5,138 @@ import tomllib
from email.message import EmailMessage from email.message import EmailMessage
from os import scandir from os import scandir
from pathlib import Path from pathlib import Path
import signal
import time
run = True import requests
from requests.auth import HTTPBasicAuth
def handler(signum, frame):
global run def main(end):
run = False with open("config.toml", "rb") as f:
config = tomllib.load(f)
signal.signal(signal.SIGINT, handler) with open("localisations.toml", "rb") as f:
signal.signal(signal.SIGTERM, handler) localisations = tomllib.load(f)
while not end.is_set():
with open("config.toml", "rb") as f: metrics = {}
config = tomllib.load(f)
# gather jobs
with open("localisations.toml", "rb") as f: completed_jobs = []
localisations = tomllib.load(f) error_jobs = []
with scandir(config["UPLOAD_FOLDER"]) as uploads:
for entry in uploads:
while run: if entry.is_dir():
# gather jobs with scandir(entry.path) as job:
finished_jobs = [] for file in job:
with scandir(config["UPLOAD_FOLDER"]) as uploads: if file.is_file():
for entry in uploads: if file.name == "done":
if entry.is_dir(): completed_jobs.append(entry.path)
with scandir(entry.path) as job: break
for file in job: elif file.name == "error":
if file.is_file() and file.name == "done": error_jobs.append(entry.path)
finished_jobs.append(entry.path) break
break
metrics["total_finished_jobs"] = len(completed_jobs)
# send emails metrics["current_job_errors"] = len(error_jobs)
sent = 0
with smtplib.SMTP( try:
host=config["MAIL"]["SERVER"], local_hostname = config["MAIL"]["LOCAL_HOSTNAME"]
port=config["MAIL"]["PORT"], except KeyError:
local_hostname=config["MAIL"]["LOCAL_HOSTNAME"] if config["MAIL"]["LOCAL_HOSTNAME"] else None, local_hostname = None
) as s:
for job in finished_jobs: s = smtplib.SMTP(
sent += 1 host=config["MAIL"]["SERVER"], port=config["MAIL"]["PORT"], local_hostname=local_hostname
with open(Path(job).joinpath("metadata.json")) as f: )
metadata = json.load(f)
sent = 0
language = metadata["language"] with s:
if len(completed_jobs) > 0:
metrics["completed_job_languages"] = {}
for job in completed_jobs:
sent += 1
with open(Path(job).joinpath("metadata.json")) as f:
metadata = json.load(f)
try:
metrics["completed_job_languages"][metadata["video_language"]] += 1
except KeyError:
metrics["completed_job_languages"][metadata["video_language"]] = 1
language = metadata["language"]
msg = EmailMessage()
msg["Subject"] = localisations["mail"]["subject"][language]
msg["From"] = config["MAIL"]["FROM"]
msg["To"] = metadata["email"]
msg.set_content(
localisations["mail"]["content"][language].format(metadata["filename"])
)
# filename.language.vtt
filename = (
Path(metadata["filename"])
.with_suffix(".{}.vtt".format(metadata["video_language"]))
.name
)
with open(Path(job).joinpath("subtitles.vtt")) as f:
msg.add_attachment(f.read(), filename=filename)
s.send_message(msg)
shutil.rmtree(job)
if len(error_jobs) > 0:
try:
msg = EmailMessage()
msg["Subject"] = "Subtitle Service Error Report"
msg["From"] = config["MAIL"]["FROM"]
msg["To"] = config["MONITORING"]["MAIL"]
job_uuids = []
for job in error_jobs:
job_uuids.append(Path(job).name)
msg.set_content(
"The following jobs currently have errors:\n{}".format("\n - ".join(job_uuids))
)
except KeyError:
pass
try:
try:
auth = HTTPBasicAuth(config["METRICS"]["USER"], config["MONITORING"]["PASS"])
except KeyError:
auth = None
requests.post(config["METRICS"]["URL"], json=metrics, auth=auth)
except KeyError:
pass
print(
"[MAILSERVICE] Sent {} mails. Sleeping for {} seconds.".format(
sent, config["MAILSERVICE_INTERVAL"]
)
)
msg = EmailMessage() end.wait(config["MAILSERVICE_INTERVAL"])
msg["Subject"] = localisations["mail"]["subject"][language]
msg["From"] = config["MAIL"]["FROM"]
msg["To"] = metadata["email"]
msg.set_content(localisations["mail"]["content"][language].format(metadata["filename"]))
# filename.language.vtt if __name__ == "__main__":
filename = ( import signal
Path(metadata["filename"]).with_suffix(".{}.vtt".format(metadata["video_language"])).name from threading import Event
)
with open(Path(job).joinpath("subtitles.vtt")) as f: end = Event()
msg.add_attachment(f.read(), filename=filename)
s.send_message(msg) def handler(signum, frame):
global end
print(signum)
end.set()
shutil.rmtree(job) signal.signal(signal.SIGINT, handler)
signal.signal(signal.SIGTERM, handler)
print("[MAILSERVICE] Sent {} mails. Sleeping for 5 minutes.".format(sent)) main(end)
time.sleep(300)
# Monitoring
## Job errors
The mail service can send out emails for jobs with errors.
Currently, it will send out a summary of all jobs with errors every time it runs.
## Metrics
If configured, the mailservice will export metrics to a webserver by POSTing a summary of the executed jobs and errors as JSON.
```json
{
"completed_job_languages": {
"de": 3,
"en": 1
},
"total_finished_jobs": 4,
"current_job_errors": 0
}
```
\ No newline at end of file
...@@ -2,4 +2,5 @@ av~=10.0.0 ...@@ -2,4 +2,5 @@ av~=10.0.0
Flask~=2.3.2 Flask~=2.3.2
Flask-WTF~=1.1.1 Flask-WTF~=1.1.1
wtforms[email]~=3.0.1 wtforms[email]~=3.0.1
whitenoise~=6.5.0 whitenoise~=6.5.0
\ No newline at end of file requests
\ No newline at end of file
...@@ -2,10 +2,10 @@ import json ...@@ -2,10 +2,10 @@ import json
from os import path, makedirs from os import path, makedirs
from uuid import uuid4 from uuid import uuid4
import av
from flask import current_app, render_template, abort, url_for, redirect, request, stream_with_context from flask import current_app, render_template, abort, url_for, redirect, request, stream_with_context
from forms import UploadForm from forms import UploadForm
import av
@current_app.route("/upload/<string:language>", methods=["GET", "POST"]) @current_app.route("/upload/<string:language>", methods=["GET", "POST"])
...@@ -18,8 +18,8 @@ def upload(language: str): ...@@ -18,8 +18,8 @@ def upload(language: str):
] ]
if form.is_submitted(): if form.is_submitted():
if form.email.data: if not form.email.data.endswith(current_app.config["MAIL_DOMAIN"]):
form.email.data += "@ptb.de" form.email.data += current_app.config["MAIL_DOMAIN"]
if form.validate(): if form.validate():
...@@ -73,9 +73,9 @@ def upload(language: str): ...@@ -73,9 +73,9 @@ def upload(language: str):
return process_file() return process_file()
else: else:
if form.email.data: if form.email.data:
form.email.data = form.email.data[:-7] form.email.data = form.email.data[:-len(current_app.config["MAIL_DOMAIN"])]
if language != "de": if language != current_app.config["DEFAULT_LANGUAGE"]:
form.email.label.text = current_app.config["LOCALISATIONS"]["email"]["label"][language] form.email.label.text = current_app.config["LOCALISATIONS"]["email"]["label"][language]
form.language.label.text = current_app.config["LOCALISATIONS"]["language"]["label"][language] form.language.label.text = current_app.config["LOCALISATIONS"]["language"]["label"][language]
form.file.label.text = current_app.config["LOCALISATIONS"]["file"]["label"][language] form.file.label.text = current_app.config["LOCALISATIONS"]["file"]["label"][language]
...@@ -86,7 +86,7 @@ def upload(language: str): ...@@ -86,7 +86,7 @@ def upload(language: str):
@current_app.route("/") @current_app.route("/")
@current_app.route("/upload/") @current_app.route("/upload/")
def default(): def default():
return redirect(url_for("upload", language="de")) return redirect(url_for("upload", language=current_app.config["DEFAULT_LANGUAGE"]))
def set_language(language): def set_language(language):
......
<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-arrow-left" viewBox="0 0 16 16">
<path fill-rule="evenodd" d="M15 8a.5.5 0 0 0-.5-.5H2.707l3.147-3.146a.5.5 0 1 0-.708-.708l-4 4a.5.5 0 0 0 0 .708l4 4a.5.5 0 0 0 .708-.708L2.707 8.5H14.5A.5.5 0 0 0 15 8z"/>
</svg>
\ No newline at end of file
File added
No preview for this file type
...@@ -57,7 +57,7 @@ ...@@ -57,7 +57,7 @@
</p> </p>
<br> <br>
<p class="mb-1">{{ config["LOCALISATIONS"]["contact"]["text"][request.language] }}:</p> <p class="mb-1">{{ config["LOCALISATIONS"]["contact"]["text"][request.language] }}:</p>
<p class="mb-1">{{ config["CONTACT_ORG"] }} <a class="link-secondary" href="mailto:{{ config[" CONTACT_MAIL"]}}">{{ config["CONTACT_NAME"] }}</a></p> <p class="mb-1">{{ config["CONTACT"]["ORG"] }} <a class="link-secondary" href="mailto:{{ config["CONTACT"]["MAIL"] }}">{{ config["CONTACT"]["NAME"] }}</a></p>
<p class="font-monospace"><a class="link-secondary" target="_blank" referrerpolicy="no-referrer" href="https://gitlab1.ptb.de/janhartig/whisper-webvtt-transcriber">whisper-webvtt-transcriber</a></p> <p class="font-monospace"><a class="link-secondary" target="_blank" referrerpolicy="no-referrer" href="https://gitlab1.ptb.de/janhartig/whisper-webvtt-transcriber">whisper-webvtt-transcriber</a></p>
</div> </div>
</footer> </footer>
......
{% extends "base.html" %} {% extends "base.html" %}
{% block content %} {% block content %}
<div class="py-5 text-center"> <div class="pt-5 text-center">
<img class="mx-auto mb-4 spinner processing transition-opacity" src="{{ url_for('static', filename='img/arrow-repeat.svg') }}" alt="Checkmark" height="136"> <img class="mx-auto mb-4 spinner processing transition-opacity" src="{{ url_for('static', filename='img/arrow-repeat.svg') }}" alt="Spinning arrow" height="136" aria-hidden="true">
<img class="mx-auto mb-4 success hidden transition-opacity" src="{{ url_for('static', filename='img/cloud-check.svg') }}" alt="Checkmark" height="136" hidden> <img class="mx-auto mb-4 success hidden transition-opacity" src="{{ url_for('static', filename='img/cloud-check.svg') }}" alt="Checkmark" height="136" aria-hidden="true" hidden>
<p class="pt-4 lead processing transition-opacity">{% autoescape false %}{{config["LOCALISATIONS"]["processingtext"][request.language]}}{% endautoescape %}</p> <p class="pt-4 lead processing transition-opacity">{% autoescape false %}{{ config["LOCALISATIONS"]["processingtext"][request.language] }}{% endautoescape %}</p>
<p class="pt-4 lead success hidden transition-opacity" hidden>{% autoescape false %}{{config["LOCALISATIONS"]["successtext"][request.language]}}{% endautoescape %}</p> <p class="pt-4 lead success hidden transition-opacity" hidden>{% autoescape false %}{{ config["LOCALISATIONS"]["successtext"][request.language] }}{% endautoescape %}</p>
<a class="pt-4 fs-1 lead success hidden transition-opacity icon-link icon-link-hover link-offset-2" style="--bs-icon-link-transform: translate3d(-.125rem, 0, 0);" href="{{ url_for('upload', language=request.language) }}">
<img class="bi" aria-label='{{ config["LOCALISATIONS"]["return"][request.language] }}' src="{{ url_for('static', filename='img/arrow-left.svg') }}" alt="Return arrow" height="4em">
</a>
</div> </div>
{% endblock %} {% endblock %}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment