diff --git a/app.py b/app.py index 9e22b513de9538577a41420826c41971b81272fa..b2d552cc731eeb9a882f3c2a839772a048bed9cf 100644 --- a/app.py +++ b/app.py @@ -18,19 +18,16 @@ def create_app(): with open("localisations.toml", "rb") as file: localisations = tomllib.load(file) - app.config["LOCALISATIONS"] = localisations - del localisations - - with app.app_context(): - import routes - for language in app.config["ENABLED_LOCALISATIONS"]: - check_localisation(app.logger, app.config["LOCALISATIONS"], language) + check_localisation(app.logger, localisations, language) - del language + app.config["LOCALISATIONS"] = localisations app.logger.info("Localisations loaded.") + with app.app_context(): + import routes + return app diff --git a/architecture.md b/architecture.md new file mode 100644 index 0000000000000000000000000000000000000000..b1cf1f43f4f33c322f32bb5581072cf03d91dafa --- /dev/null +++ b/architecture.md @@ -0,0 +1,46 @@ +# Architecture overview + +1. Webserver takes and validates user-submitted files +2. Cron job scans files and enqueues new jobs on cluster +3. Job gets processed on the cluster using [whisper-webvtt-transcriber](https://gitlab1.ptb.de/janhartig/whisper-webvtt-transcriber) +4. Mailservice scans job folders for completed jobs and: + - Sends processed files to users + - Optional: Notifies admins on processing errors + - Optional: Sends monitoring data to webserver + + +## Job folders +Jobs are given a randomly generated UUID. A job folder looks like this: + +``` +job_uuid: + - audio.mkv + - video_language.txt + - metadata.json + - statefile (new/submitted/done/error) +``` + +### audio.mkv +Preprocessed input file. Contains only audio data to conserve disk space. + +### video_language.txt +Contains the video language tag used for processing with [whisper-webvtt-transcriber](https://gitlab1.ptb.de/janhartig/whisper-webvtt-transcriber). +Is used by the cronjob script (step 3). + +### metadata.json +Used by mailservice (step 4). 
+```json +{ + "email": "example@example.com", + "language": "de", + "video_language": "de", + "filename": "original_filename.original_file_extension" +} +``` + +### statefile +State is tracked through the following files in the jobs folder: + - new: Job has been submitted by user + - submitted: Job has been scheduled on gpu cluster + - done: Job has been processed without errors + - error: Job has been processed with errors \ No newline at end of file diff --git a/config.example.toml b/config.example.toml index 484a2b7834caf2f416e76535d6a369a60f0ea7de..c781ab523372b9db904c2ff82ca4003df9fe8836 100644 --- a/config.example.toml +++ b/config.example.toml @@ -1,15 +1,28 @@ +# Entries in comments are optional + SECRET_KEY = "your-secret-key" UPLOAD_FOLDER = "uploads" MAX_CONTENT_LENGTH = 10 # in GB ENABLED_LOCALISATIONS = [ "de", "en" ] +DEFAULT_LANGUAGE = "de" +MAIL_DOMAIN = "@example.com" + +MAILSERVICE_INTERVAL = 300 # in seconds +# MONITORING_MAIL = "john.smith@example.com" -CONTACT_ORG = "Fun Inc." -CONTACT_NAME = "John Smith" -CONTACT_MAIL = "john.smith@example.com" +[ CONTACT ] +ORG = "Fun Inc." 
+NAME = "John Smith" +MAIL = "john.smith@example.com" [ MAIL ] FROM = "funinc@example.com" SERVER = "smtp.example.com" PORT = 25 -# LOCAL_HOSTNAME: Set local hostname when talking to SMTP Server \ No newline at end of file +# LOCAL_HOSTNAME: Set local hostname when talking to SMTP Server + +#[ METRICS ] +#URL = "http://localhost:8080/telegraf" +#USER = "basic_auth_user" +#PASS = "basic_auth_password" \ No newline at end of file diff --git a/forms.py b/forms.py index 27a9654b6f28e82112b5dbb98d529c751bcc2e8a..8b251e7191fd7066ab4e48ac385106c16225982d 100644 --- a/forms.py +++ b/forms.py @@ -19,7 +19,7 @@ def validate_audio(_, field): if not has_audio: raise ValidationError("noaudiotrack") except av.AVError as e: - current_app.logger.error( + current_app.logger.info( "Error while checking audio of file '{}': {}".format(file.filename, str(e)) ) raise ValidationError("brokenfile") @@ -27,15 +27,15 @@ def validate_audio(_, field): class UploadForm(FlaskForm): email = StringField( - current_app.config["LOCALISATIONS"]["email"]["label"]["de"], + current_app.config["LOCALISATIONS"]["email"]["label"][current_app.config["DEFAULT_LANGUAGE"]], validators=[InputRequired("invalidEmail"), Email("invalidEmail")], ) language = SelectField( - current_app.config["LOCALISATIONS"]["language"]["label"]["de"], + current_app.config["LOCALISATIONS"]["language"]["label"][current_app.config["DEFAULT_LANGUAGE"]], validators=[InputRequired("required")], ) file = FileField( - current_app.config["LOCALISATIONS"]["file"]["label"]["de"], + current_app.config["LOCALISATIONS"]["file"]["label"][current_app.config["DEFAULT_LANGUAGE"]], validators=[ FileRequired("nofile"), FileSize(current_app.config["MAX_CONTENT_LENGTH"]), diff --git a/localisations.toml b/localisations.toml index 1a217b84b26af6131491a010d514efbd7ea78ec1..fe269ac46ff6ccca3a45ff180c6a2ef065a8a8ca 100644 --- a/localisations.toml +++ b/localisations.toml @@ -3,8 +3,8 @@ de = "PTB Untertitel Dienst" en = "PTB Subtitle Service" [ leadtext ] 
-de = "Dieser Dienst erstellt automatisch Untertitel für Videos,<br>welche z.B. für PTB-Tube genutzt werden können.<br>Die KI-Verarbeitung läuft im HPC-Cluster der PTB." -en = "This service automatically creates subtitles for videos,<br>which can, for example, be used for PTB-Tube.<br>The AI processing runs in PTB's HPC cluster." +de = "Dieser Dienst erstellt automatisch Untertitel für Videos,<br>welche z.B. für <a class='link-dark link-offset-2 link-offset-3-hover link-underline link-underline-opacity-0 link-underline-opacity-75-hover' href='https://tube.ptb.de'>PTB-Tube</a> genutzt werden können.<br>Die KI-Verarbeitung läuft im HPC-Cluster der PTB." +en = "This service automatically creates subtitles for videos,<br>which can, for example, be used for <a class='link-dark link-offset-2 link-offset-3-hover link-underline link-underline-opacity-0 link-underline-opacity-75-hover' href='https://tube.ptb.de'>PTB-Tube</a>.<br>The AI processing runs in PTB's HPC cluster." [ successtext] de = "Daten erfolgreich übermittelt. Ihre Untertitel werden demnächst generiert.<br>Sobald der Prozess abgeschlossen ist, erhalten Sie eine E-Mail." @@ -87,6 +87,10 @@ en = "Error during processing. File might be corrupt or format is unsupported." 
de = "Untertitel erstellen" en = "Create subtitle" +[ return ] +de = "Zurück" +en = "Return" + [ contact.text ] de = "Kontakt" en = "Contact" diff --git a/mailservice.py b/mailservice.py index b5ed896d4115e9f545874eb49b67c469d44acd2a..5ad9936f5f586561d9260f0b74ecac9eec51c4f3 100644 --- a/mailservice.py +++ b/mailservice.py @@ -5,72 +5,138 @@ import tomllib from email.message import EmailMessage from os import scandir from pathlib import Path -import signal -import time -run = True - - -def handler(signum, frame): - global run - run = False - - -signal.signal(signal.SIGINT, handler) -signal.signal(signal.SIGTERM, handler) - - -with open("config.toml", "rb") as f: - config = tomllib.load(f) - -with open("localisations.toml", "rb") as f: - localisations = tomllib.load(f) - - -while run: - # gather jobs - finished_jobs = [] - with scandir(config["UPLOAD_FOLDER"]) as uploads: - for entry in uploads: - if entry.is_dir(): - with scandir(entry.path) as job: - for file in job: - if file.is_file() and file.name == "done": - finished_jobs.append(entry.path) - break - - # send emails - sent = 0 - with smtplib.SMTP( - host=config["MAIL"]["SERVER"], - port=config["MAIL"]["PORT"], - local_hostname=config["MAIL"]["LOCAL_HOSTNAME"] if config["MAIL"]["LOCAL_HOSTNAME"] else None, - ) as s: - for job in finished_jobs: - sent += 1 - with open(Path(job).joinpath("metadata.json")) as f: - metadata = json.load(f) - - language = metadata["language"] +import requests +from requests.auth import HTTPBasicAuth + + +def main(end): + with open("config.toml", "rb") as f: + config = tomllib.load(f) + + with open("localisations.toml", "rb") as f: + localisations = tomllib.load(f) + + while not end.is_set(): + metrics = {} + + # gather jobs + completed_jobs = [] + error_jobs = [] + with scandir(config["UPLOAD_FOLDER"]) as uploads: + for entry in uploads: + if entry.is_dir(): + with scandir(entry.path) as job: + for file in job: + if file.is_file(): + if file.name == "done": + 
completed_jobs.append(entry.path) + break + elif file.name == "error": + error_jobs.append(entry.path) + break + + metrics["total_finished_jobs"] = len(completed_jobs) + metrics["current_job_errors"] = len(error_jobs) + + try: + local_hostname = config["MAIL"]["LOCAL_HOSTNAME"] + except KeyError: + local_hostname = None + + s = smtplib.SMTP( + host=config["MAIL"]["SERVER"], port=config["MAIL"]["PORT"], local_hostname=local_hostname + ) + + sent = 0 + with s: + if len(completed_jobs) > 0: + metrics["completed_job_languages"] = {} + for job in completed_jobs: + sent += 1 + with open(Path(job).joinpath("metadata.json")) as f: + metadata = json.load(f) + + try: + metrics["completed_job_languages"][metadata["video_language"]] += 1 + except KeyError: + metrics["completed_job_languages"][metadata["video_language"]] = 1 + + language = metadata["language"] + + msg = EmailMessage() + msg["Subject"] = localisations["mail"]["subject"][language] + msg["From"] = config["MAIL"]["FROM"] + msg["To"] = metadata["email"] + + msg.set_content( + localisations["mail"]["content"][language].format(metadata["filename"]) + ) + + # filename.language.vtt + filename = ( + Path(metadata["filename"]) + .with_suffix(".{}.vtt".format(metadata["video_language"])) + .name + ) + + with open(Path(job).joinpath("subtitles.vtt")) as f: + msg.add_attachment(f.read(), filename=filename) + + s.send_message(msg) + + shutil.rmtree(job) + + if len(error_jobs) > 0: + try: + msg = EmailMessage() + msg["Subject"] = "Subtitle Service Error Report" + msg["From"] = config["MAIL"]["FROM"] + msg["To"] = config["MONITORING_MAIL"] # optional top-level key, see config.example.toml + + job_uuids = [Path(job).name for job in error_jobs] + + msg.set_content( + "The following jobs currently have errors:\n - {}".format("\n - ".join(job_uuids)) + ) + + s.send_message(msg) + + except KeyError: + pass + + try: + try: + auth = HTTPBasicAuth(config["METRICS"]["USER"], config["METRICS"]["PASS"]) + except KeyError: + auth = None + + requests.post(config["METRICS"]["URL"], 
json=metrics, auth=auth) + + except KeyError: + pass + + print( + "[MAILSERVICE] Sent {} mails. Sleeping for {} seconds.".format( + sent, config["MAILSERVICE_INTERVAL"] + ) + ) - msg = EmailMessage() - msg["Subject"] = localisations["mail"]["subject"][language] - msg["From"] = config["MAIL"]["FROM"] - msg["To"] = metadata["email"] + end.wait(config["MAILSERVICE_INTERVAL"]) - msg.set_content(localisations["mail"]["content"][language].format(metadata["filename"])) - # filename.language.vtt - filename = ( - Path(metadata["filename"]).with_suffix(".{}.vtt".format(metadata["video_language"])).name - ) +if __name__ == "__main__": + import signal + from threading import Event - with open(Path(job).joinpath("subtitles.vtt")) as f: - msg.add_attachment(f.read(), filename=filename) + end = Event() - s.send_message(msg) + def handler(signum, frame): + global end + print(signum) + end.set() - shutil.rmtree(job) + signal.signal(signal.SIGINT, handler) + signal.signal(signal.SIGTERM, handler) - print("[MAILSERVICE] Sent {} mails. Sleeping for 5 minutes.".format(sent)) - time.sleep(300) + main(end) diff --git a/monitoring.md b/monitoring.md new file mode 100644 index 0000000000000000000000000000000000000000..b9950a099d206307c77af9f5301d8f8aba049daf --- /dev/null +++ b/monitoring.md @@ -0,0 +1,19 @@ +# Monitoring + +## Job errors +The mail service can send out emails for jobs with errors. +Currently, it will send out a summary of all jobs with errors every time it runs. + +## Metrics +If configured, the mailservice will export metrics to a webserver by POSTing a summary of the executed jobs and errors as json. 
+ ```json +{ + "completed_job_languages": { + "de": 3, + "en": 1 + }, + "total_finished_jobs": 4, + "current_job_errors": 0 +} +``` \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index a92dd520f143881797502aa2db7b46f4dbe27ac8..8df2733da7f81614c39aee0e322fcd600d3b7103 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,5 @@ av~=10.0.0 Flask~=2.3.2 Flask-WTF~=1.1.1 wtforms[email]~=3.0.1 -whitenoise~=6.5.0 \ No newline at end of file +whitenoise~=6.5.0 +requests \ No newline at end of file diff --git a/routes.py b/routes.py index fdb1e06abc9ddec34c2294bd359891342b1889e7..1167e8151a37ca7052fbe70fe1bde0e85abc90c6 100644 --- a/routes.py +++ b/routes.py @@ -2,10 +2,10 @@ import json from os import path, makedirs from uuid import uuid4 +import av from flask import current_app, render_template, abort, url_for, redirect, request, stream_with_context from forms import UploadForm -import av @current_app.route("/upload/<string:language>", methods=["GET", "POST"]) @@ -18,8 +18,8 @@ def upload(language: str): ] if form.is_submitted(): - if form.email.data: - form.email.data += "@ptb.de" + if form.email.data and not form.email.data.endswith(current_app.config["MAIL_DOMAIN"]): + form.email.data += current_app.config["MAIL_DOMAIN"] if form.validate(): @@ -73,9 +73,9 @@ def upload(language: str): return process_file() else: if form.email.data: - form.email.data = form.email.data[:-7] + form.email.data = form.email.data.removesuffix(current_app.config["MAIL_DOMAIN"]) - if language != "de": + if language != current_app.config["DEFAULT_LANGUAGE"]: form.email.label.text = current_app.config["LOCALISATIONS"]["email"]["label"][language] form.language.label.text = current_app.config["LOCALISATIONS"]["language"]["label"][language] form.file.label.text = current_app.config["LOCALISATIONS"]["file"]["label"][language] @@ -86,7 +86,7 @@ def upload(language: str): @current_app.route("/") @current_app.route("/upload/") def default(): - return redirect(url_for("upload", 
language="de")) + return redirect(url_for("upload", language=current_app.config["DEFAULT_LANGUAGE"])) def set_language(language): diff --git a/static/img/arrow-left.svg b/static/img/arrow-left.svg new file mode 100644 index 0000000000000000000000000000000000000000..9d885017c34e499b41a8e038583b62b3a7433aaf --- /dev/null +++ b/static/img/arrow-left.svg @@ -0,0 +1,3 @@ +<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-arrow-left" viewBox="0 0 16 16"> + <path fill-rule="evenodd" d="M15 8a.5.5 0 0 0-.5-.5H2.707l3.147-3.146a.5.5 0 1 0-.708-.708l-4 4a.5.5 0 0 0 0 .708l4 4a.5.5 0 0 0 .708-.708L2.707 8.5H14.5A.5.5 0 0 0 15 8z"/> +</svg> \ No newline at end of file diff --git a/static/img/arrow-left.svg.gz b/static/img/arrow-left.svg.gz new file mode 100644 index 0000000000000000000000000000000000000000..f3207160d6e80dee4d8ace0064823ba84187dbab Binary files /dev/null and b/static/img/arrow-left.svg.gz differ diff --git a/static/img/arrow-repeat.svg.gz b/static/img/arrow-repeat.svg.gz index 783a8a12510e1e1f24853380fa8b7ed5fe003151..ad3b6415c18e870a66778ff79878bcf06536fd4c 100644 Binary files a/static/img/arrow-repeat.svg.gz and b/static/img/arrow-repeat.svg.gz differ diff --git a/templates/base.html b/templates/base.html index f8b4eeb685cc8fd4e132d2fd24865fb67624bde0..309091b7ac937df5ebaf311782478dfcaa069fa1 100644 --- a/templates/base.html +++ b/templates/base.html @@ -57,7 +57,7 @@ </p> <br> <p class="mb-1">{{ config["LOCALISATIONS"]["contact"]["text"][request.language] }}:</p> - <p class="mb-1">{{ config["CONTACT_ORG"] }} <a class="link-secondary" href="mailto:{{ config[" CONTACT_MAIL"]}}">{{ config["CONTACT_NAME"] }}</a></p> + <p class="mb-1">{{ config["CONTACT"]["ORG"] }} <a class="link-secondary" href="mailto:{{ config["CONTACT"]["MAIL"] }}">{{ config["CONTACT"]["NAME"] }}</a></p> <p class="font-monospace"><a class="link-secondary" target="_blank" referrerpolicy="no-referrer" 
href="https://gitlab1.ptb.de/janhartig/whisper-webvtt-transcriber">whisper-webvtt-transcriber</a></p> </div> </footer> diff --git a/templates/success.html b/templates/success.html index 72e6595c2462315addb8af9b43e3d0d18d496846..e8c59bd2563e6df7f7ec28e3bf69e6411892339d 100644 --- a/templates/success.html +++ b/templates/success.html @@ -1,11 +1,14 @@ {% extends "base.html" %} {% block content %} -<div class="py-5 text-center"> - <img class="mx-auto mb-4 spinner processing transition-opacity" src="{{ url_for('static', filename='img/arrow-repeat.svg') }}" alt="Checkmark" height="136"> - <img class="mx-auto mb-4 success hidden transition-opacity" src="{{ url_for('static', filename='img/cloud-check.svg') }}" alt="Checkmark" height="136" hidden> - <p class="pt-4 lead processing transition-opacity">{% autoescape false %}{{config["LOCALISATIONS"]["processingtext"][request.language]}}{% endautoescape %}</p> - <p class="pt-4 lead success hidden transition-opacity" hidden>{% autoescape false %}{{config["LOCALISATIONS"]["successtext"][request.language]}}{% endautoescape %}</p> +<div class="pt-5 text-center"> + <img class="mx-auto mb-4 spinner processing transition-opacity" src="{{ url_for('static', filename='img/arrow-repeat.svg') }}" alt="Spinning arrow" height="136" aria-hidden="true"> + <img class="mx-auto mb-4 success hidden transition-opacity" src="{{ url_for('static', filename='img/cloud-check.svg') }}" alt="Checkmark" height="136" aria-hidden="true" hidden> + <p class="pt-4 lead processing transition-opacity">{% autoescape false %}{{ config["LOCALISATIONS"]["processingtext"][request.language] }}{% endautoescape %}</p> + <p class="pt-4 lead success hidden transition-opacity" hidden>{% autoescape false %}{{ config["LOCALISATIONS"]["successtext"][request.language] }}{% endautoescape %}</p> + <a class="pt-4 fs-1 lead success hidden transition-opacity icon-link icon-link-hover link-offset-2" style="--bs-icon-link-transform: translate3d(-.125rem, 0, 0);" href="{{ 
url_for('upload', language=request.language) }}"> + <img class="bi" aria-label='{{ config["LOCALISATIONS"]["return"][request.language] }}' src="{{ url_for('static', filename='img/arrow-left.svg') }}" alt="Return arrow" height="4em"> + </a> </div> {% endblock %}