learningequality · rtibbles · Sep 6, 2022 · Aug 30, 2022 · Aug 31, 2022 · Aug 31, 2022
diff --git a/Makefile b/Makefile
@@ -141,6 +141,9 @@ filedurations:
 learningactivities:
 	python contentcuration/manage.py set_default_learning_activities
 
+hascaptions:
+	python contentcuration/manage.py set_orm_based_has_captions
+
 export COMPOSE_PROJECT_NAME=studio_$(shell git rev-parse --abbrev-ref HEAD)
 
 purge-postgres:

diff --git a/contentcuration/contentcuration/management/commands/set_file_duration.py b/contentcuration/contentcuration/management/commands/set_file_duration.py
@@ -14,7 +14,7 @@
 CHUNKSIZE = 10000
 
 
-def extract_duration_of_media(f_in, extension):
+def extract_duration_of_media(f_in, extension):  # noqa C901
     """
     For more details on these commands, refer to the ffmpeg Wiki:
     https://trac.ffmpeg.org/wiki/FFprobeTips#Formatcontainerduration
@@ -55,9 +55,12 @@ def extract_duration_of_media(f_in, extension):
             stdin=f_in,
             stderr=subprocess.PIPE
         )
-        second_last_line = result.stderr.decode("utf-8").strip().splitlines()[-2]
-        time_code = second_last_line.split(" time=")[1].split(" ")[0]
-        hours, minutes, seconds = time_code.split(":")
+        try:
+            second_last_line = result.stderr.decode("utf-8").strip().splitlines()[-2]
+            time_code = second_last_line.split(" time=")[1].split(" ")[0]
+            hours, minutes, seconds = time_code.split(":")
+        except IndexError:
+            raise RuntimeError("Unable to determine media length")
         try:
             hours = int(hours)
         except ValueError:
@@ -103,7 +106,7 @@ def handle(self, *args, **options):
                 except FileNotFoundError:
                     logging.warning("File {} not found".format(file))
                     excluded_files.add(file.file_on_disk.name)
-                except subprocess.CalledProcessError:
+                except (subprocess.CalledProcessError, RuntimeError):
                     logging.warning("File {} could not be read for duration".format(file))
                     excluded_files.add(file.file_on_disk.name)
 

diff --git a/contentcuration/contentcuration/management/commands/set_orm_based_has_captions.py b/contentcuration/contentcuration/management/commands/set_orm_based_has_captions.py
@@ -0,0 +1,49 @@
+import logging as logmodule
+import time
+
+from django.core.management.base import BaseCommand
+from django.db.models import Exists
+from django.db.models import OuterRef
+from le_utils.constants import content_kinds
+from le_utils.constants import format_presets
+from le_utils.constants.labels import accessibility_categories
+
+from contentcuration.models import ContentNode
+from contentcuration.models import File
+
+logmodule.basicConfig(level=logmodule.INFO)
+logging = logmodule.getLogger('command')
+
+# Upper bound on how many node ids are selected for a single bulk update in this command
+CHUNKSIZE = 10000
+
+
+class Command(BaseCommand):
+    help = "Set the captions/subtitles accessibility label on unlabelled video nodes that have a subtitle file in their own language"
+
+    def handle(self, *args, **options):
+        """Label matching video nodes in batches of at most CHUNKSIZE, logging each batch and the final total."""
+        start = time.time()
+
+        logging.info("Setting 'has captions' for video kinds")
+
+        has_captions_subquery = Exists(File.objects.filter(contentnode=OuterRef("id"), language=OuterRef("language"), preset_id=format_presets.VIDEO_SUBTITLE))
+        # Only try to update video nodes which have not had any accessibility labels set on them
+        # this will allow this management command to be rerun and resume from where it left off
+        # and also prevent stomping previous edits to the accessibility_labels field.
+        updateable_nodes = ContentNode.objects.filter(has_captions_subquery, kind=content_kinds.VIDEO, accessibility_labels__isnull=True)
+
+        # Never evaluated here: each update() below embeds it as a subquery, so it always selects the next unlabelled batch.
+        updateable_node_slice = updateable_nodes.values_list("id", flat=True)[0:CHUNKSIZE]
+        count = 0
+        while True:
+            this_count = ContentNode.objects.filter(
+                id__in=updateable_node_slice
+            ).update(accessibility_labels={accessibility_categories.CAPTIONS_SUBTITLES: True})
+            if not this_count:
+                # Nothing left to label; stopping on an empty batch also rules out looping forever without progress.
+                break
+            logging.info("Set has captions metadata for {} nodes".format(this_count))
+            count += this_count
+
+        logging.info('Finished setting all has captions metadata for {} nodes in {} seconds'.format(count, time.time() - start))