diff --git a/Makefile b/Makefile index 383bd3ad32..282b90de57 100644 --- a/Makefile +++ b/Makefile @@ -141,6 +141,9 @@ filedurations: learningactivities: python contentcuration/manage.py set_default_learning_activities +hascaptions: + python contentcuration/manage.py set_orm_based_has_captions + export COMPOSE_PROJECT_NAME=studio_$(shell git rev-parse --abbrev-ref HEAD) purge-postgres: diff --git a/contentcuration/contentcuration/management/commands/set_file_duration.py b/contentcuration/contentcuration/management/commands/set_file_duration.py index fd09aaf51d..1e828dac05 100644 --- a/contentcuration/contentcuration/management/commands/set_file_duration.py +++ b/contentcuration/contentcuration/management/commands/set_file_duration.py @@ -14,7 +14,7 @@ CHUNKSIZE = 10000 -def extract_duration_of_media(f_in, extension): +def extract_duration_of_media(f_in, extension): # noqa C901 """ For more details on these commands, refer to the ffmpeg Wiki: https://trac.ffmpeg.org/wiki/FFprobeTips#Formatcontainerduration @@ -55,9 +55,12 @@ def extract_duration_of_media(f_in, extension): stdin=f_in, stderr=subprocess.PIPE ) - second_last_line = result.stderr.decode("utf-8").strip().splitlines()[-2] - time_code = second_last_line.split(" time=")[1].split(" ")[0] - hours, minutes, seconds = time_code.split(":") + try: + second_last_line = result.stderr.decode("utf-8").strip().splitlines()[-2] + time_code = second_last_line.split(" time=")[1].split(" ")[0] + hours, minutes, seconds = time_code.split(":") + except IndexError: + raise RuntimeError("Unable to determine media length") try: hours = int(hours) except ValueError: @@ -103,7 +106,7 @@ def handle(self, *args, **options): except FileNotFoundError: logging.warning("File {} not found".format(file)) excluded_files.add(file.file_on_disk.name) - except subprocess.CalledProcessError: + except (subprocess.CalledProcessError, RuntimeError): logging.warning("File {} could not be read for duration".format(file)) excluded_files.add(file.file_on_disk.name) diff --git a/contentcuration/contentcuration/management/commands/set_orm_based_has_captions.py b/contentcuration/contentcuration/management/commands/set_orm_based_has_captions.py new file mode 100644 index 0000000000..edbcbbcd40 --- /dev/null +++ b/contentcuration/contentcuration/management/commands/set_orm_based_has_captions.py @@ -0,0 +1,49 @@ +import logging as logmodule +import time + +from django.core.management.base import BaseCommand +from django.db.models import Exists +from django.db.models import OuterRef +from le_utils.constants import content_kinds +from le_utils.constants import format_presets +from le_utils.constants.labels import accessibility_categories + +from contentcuration.models import ContentNode +from contentcuration.models import File + +logmodule.basicConfig(level=logmodule.INFO) +logging = logmodule.getLogger('command') + + +CHUNKSIZE = 10000 + + +class Command(BaseCommand): + + def handle(self, *args, **options): + start = time.time() + + logging.info("Setting 'has captions' for video kinds") + + has_captions_subquery = Exists(File.objects.filter(contentnode=OuterRef("id"), language=OuterRef("language"), preset_id=format_presets.VIDEO_SUBTITLE)) + # Only try to update video nodes which have not had any accessibility labels set on them + # this will allow this management command to be rerun and resume from where it left off + # and also prevent stomping previous edits to the accessibility_labels field. + updateable_nodes = ContentNode.objects.filter(has_captions_subquery, kind=content_kinds.VIDEO, accessibility_labels__isnull=True) + + updateable_node_slice = updateable_nodes.values_list("id", flat=True)[0:CHUNKSIZE] + + count = 0 + + while updateable_nodes.exists(): + this_count = ContentNode.objects.filter( + id__in=updateable_node_slice + ).update(accessibility_labels={accessibility_categories.CAPTIONS_SUBTITLES: True}) + + logging.info("Set has captions metadata for {} nodes".format(this_count)) + + count += this_count + + updateable_node_slice = updateable_nodes.values_list("id", flat=True)[0:CHUNKSIZE] + + logging.info('Finished setting all has captions metadata for {} nodes in {} seconds'.format(count, time.time() - start)) diff --git a/contentcuration/contentcuration/utils/gcs_storage.py b/contentcuration/contentcuration/utils/gcs_storage.py index bef04a7798..07d8e899f3 100644 --- a/contentcuration/contentcuration/utils/gcs_storage.py +++ b/contentcuration/contentcuration/utils/gcs_storage.py @@ -58,6 +58,9 @@ def open(self, name, mode="rb", blob_object=None): else: blob = blob_object + if blob is None: + raise FileNotFoundError("{} not found".format(name)) + fobj = tempfile.NamedTemporaryFile() blob.download_to_file(fobj) # flush it to disk