diff --git a/Makefile b/Makefile index 29fe984285..99b55d3762 100644 --- a/Makefile +++ b/Makefile @@ -31,7 +31,7 @@ learningactivities: set-tsvectors: python contentcuration/manage.py set_channel_tsvectors - python contentcuration/manage.py set_contentnode_tsvectors + python contentcuration/manage.py set_contentnode_tsvectors --published ############################################################### # END PRODUCTION COMMANDS ##################################### diff --git a/contentcuration/contentcuration/debug/middleware.py b/contentcuration/contentcuration/debug/middleware.py deleted file mode 100644 index 803a94f89b..0000000000 --- a/contentcuration/contentcuration/debug/middleware.py +++ /dev/null @@ -1,47 +0,0 @@ -import threading -import time - -import debug_panel.urls -from debug_panel.cache import cache -from debug_panel.middleware import DebugPanelMiddleware -from django.urls import reverse - - -class CustomDebugPanelMiddleware(DebugPanelMiddleware): - """ - Custom version to fix SQL escaping: - https://github.com/recamshak/django-debug-panel/issues/17#issuecomment-366268893 - """ - - def process_response(self, request, response): - """ - Store the DebugToolbarMiddleware rendered toolbar into a cache store. - The data stored in the cache are then reachable from an URL that is appened - to the HTTP response header under the 'X-debug-data-url' key. - """ - toolbar = self.__class__.debug_toolbars.get( - threading.current_thread().ident, None - ) - - response = super(DebugPanelMiddleware, self).process_response(request, response) - - if toolbar: - # for django-debug-toolbar >= 1.4 - for panel in reversed(toolbar.enabled_panels): - if ( - hasattr(panel, "generate_stats") and not panel.get_stats() - ): # PATCH HERE - panel.generate_stats(request, response) - - cache_key = "%f" % time.time() - cache.set(cache_key, toolbar.render_toolbar()) - - response["X-debug-data-url"] = request.build_absolute_uri( - reverse( - "debug_data", - urlconf=debug_panel.urls, - kwargs={"cache_key": cache_key}, - ) - ) - - return response diff --git a/contentcuration/contentcuration/debug_panel_settings.py b/contentcuration/contentcuration/debug_panel_settings.py index 79f9ddac6e..c097acbbc6 100644 --- a/contentcuration/contentcuration/debug_panel_settings.py +++ b/contentcuration/contentcuration/debug_panel_settings.py @@ -1,8 +1,13 @@ from .dev_settings import * # noqa -# These endpoints will throw an error on the django debug panel +# These endpoints will throw an error on the django debug panel. EXCLUDED_DEBUG_URLS = [ "/content/storage", + + # Disabling sync API because as soon as the sync API gets polled + # the current request data gets overwritten. + # Can be removed after websockets deployment. + "/api/sync", ] DEBUG_PANEL_ACTIVE = True @@ -14,10 +19,10 @@ def custom_show_toolbar(request): ) # noqa F405 -# if debug_panel exists, add it to our INSTALLED_APPS +# if debug_panel exists, add it to our INSTALLED_APPS. INSTALLED_APPS += ("debug_panel", "debug_toolbar", "pympler") # noqa F405 MIDDLEWARE += ( # noqa F405 - "contentcuration.debug.middleware.CustomDebugPanelMiddleware", + "debug_toolbar.middleware.DebugToolbarMiddleware", ) DEBUG_TOOLBAR_CONFIG = { "SHOW_TOOLBAR_CALLBACK": custom_show_toolbar, diff --git a/contentcuration/contentcuration/frontend/channelEdit/views/ImportFromChannels/BrowsingCard.vue b/contentcuration/contentcuration/frontend/channelEdit/views/ImportFromChannels/BrowsingCard.vue index 60efe87276..5d0447df50 100644 --- a/contentcuration/contentcuration/frontend/channelEdit/views/ImportFromChannels/BrowsingCard.vue +++ b/contentcuration/contentcuration/frontend/channelEdit/views/ImportFromChannels/BrowsingCard.vue @@ -149,7 +149,7 @@ if (this.isTopic) { return `${baseUrl}#/${this.node.id}`; } - return `${baseUrl}#/${this.node.parent}/${this.node.id}`; + return `${baseUrl}#/${this.node.parent_id}/${this.node.id}`; }, resourcesMsg() { let count; @@ -160,13 +160,8 @@ } return this.$tr('resourcesCount', { count }); }, - numLocations() { - return this.node.location_ids.length; - }, goToLocationLabel() { - return this.numLocations > 1 - ? this.$tr('goToPluralLocationsAction', { count: this.numLocations }) - : this.$tr('goToSingleLocationAction'); + return this.$tr('goToSingleLocationAction'); }, isTopic() { return this.node.kind === ContentKindsNames.TOPIC; @@ -189,8 +184,6 @@ $trs: { tagsList: 'Tags: {tags}', goToSingleLocationAction: 'Go to location', - goToPluralLocationsAction: - 'In {count, number} {count, plural, one {location} other {locations}}', addToClipboardAction: 'Copy to clipboard', resourcesCount: '{count, number} {count, plural, one {resource} other {resources}}', coach: 'Resource for coaches', diff --git a/contentcuration/contentcuration/frontend/channelEdit/views/ImportFromChannels/ChannelList.vue b/contentcuration/contentcuration/frontend/channelEdit/views/ImportFromChannels/ChannelList.vue index 76dc9e1f8c..43de8e8ce4 100644 --- a/contentcuration/contentcuration/frontend/channelEdit/views/ImportFromChannels/ChannelList.vue +++ b/contentcuration/contentcuration/frontend/channelEdit/views/ImportFromChannels/ChannelList.vue @@ -112,6 +112,7 @@ [this.channelFilter]: true, page: this.$route.query.page || 1, exclude: this.currentChannelId, + published: true, }).then(page => { this.pageCount = page.total_pages; this.channels = page.results; diff --git a/contentcuration/contentcuration/tests/helpers.py b/contentcuration/contentcuration/tests/helpers.py index 8e3172fcd4..cf1d54130e 100644 --- a/contentcuration/contentcuration/tests/helpers.py +++ b/contentcuration/contentcuration/tests/helpers.py @@ -2,7 +2,9 @@ from importlib import import_module import mock +from search.models import ContentNodeFullTextSearch +from contentcuration.models import ContentNode from contentcuration.models import TaskResult @@ -39,6 +41,12 @@ def mock_class_instance(target): else: target_cls = target + # ContentNode's node_fts field can be handled by Django when tests + # access the database but we mock it so that we don't need to query + # the database. By doing so we get faster test execution. + if type(target_cls) is ContentNode: + target_cls.node_fts = ContentNodeFullTextSearch() + class MockClass(target_cls): def __new__(cls, *args, **kwargs): return mock.Mock(spec_set=cls) diff --git a/contentcuration/contentcuration/tests/utils/test_cache.py b/contentcuration/contentcuration/tests/utils/test_cache.py index 5327da19ac..d16570648a 100644 --- a/contentcuration/contentcuration/tests/utils/test_cache.py +++ b/contentcuration/contentcuration/tests/utils/test_cache.py @@ -1,15 +1,14 @@ import mock -from django.test import TestCase +from django.test import SimpleTestCase from ..helpers import mock_class_instance -from contentcuration.models import ContentNode from contentcuration.utils.cache import ResourceSizeCache -class ResourceSizeCacheTestCase(TestCase): +class ResourceSizeCacheTestCase(SimpleTestCase): def setUp(self): super(ResourceSizeCacheTestCase, self).setUp() - self.node = mock.Mock(spec_set=ContentNode()) + self.node = mock_class_instance("contentcuration.models.ContentNode") self.node.pk = "abcdefghijklmnopqrstuvwxyz" self.redis_client = mock_class_instance("redis.client.StrictRedis") self.cache_client = mock_class_instance("django_redis.client.DefaultClient") diff --git a/contentcuration/contentcuration/tests/utils/test_nodes.py b/contentcuration/contentcuration/tests/utils/test_nodes.py index 83171288d6..be43d295dd 100644 --- a/contentcuration/contentcuration/tests/utils/test_nodes.py +++ b/contentcuration/contentcuration/tests/utils/test_nodes.py @@ -6,10 +6,10 @@ from dateutil.parser import isoparse from django.db.models import F from django.db.models import Max -from django.test import TestCase +from django.test import SimpleTestCase from ..base import StudioTestCase -from contentcuration.models import ContentNode +from contentcuration.tests.helpers import mock_class_instance from contentcuration.utils.nodes import calculate_resource_size from contentcuration.utils.nodes import ResourceSizeHelper from contentcuration.utils.nodes import SlowCalculationError @@ -42,10 +42,10 @@ def test_modified_since(self): @mock.patch("contentcuration.utils.nodes.ResourceSizeHelper") @mock.patch("contentcuration.utils.nodes.ResourceSizeCache") -class CalculateResourceSizeTestCase(TestCase): +class CalculateResourceSizeTestCase(SimpleTestCase): def setUp(self): super(CalculateResourceSizeTestCase, self).setUp() - self.node = mock.Mock(spec_set=ContentNode()) + self.node = mock_class_instance("contentcuration.models.ContentNode") def assertCalculation(self, cache, helper, force=False): helper().get_size.return_value = 456 diff --git a/contentcuration/contentcuration/utils/publish.py b/contentcuration/contentcuration/utils/publish.py index 66e8b37e49..5b38d85576 100644 --- a/contentcuration/contentcuration/utils/publish.py +++ b/contentcuration/contentcuration/utils/publish.py @@ -18,8 +18,11 @@ from django.core.files.storage import default_storage as storage from django.core.management import call_command from django.db.models import Count +from django.db.models import Exists from django.db.models import Max +from django.db.models import OuterRef from django.db.models import Q +from django.db.models import Subquery from django.db.models import Sum from django.db.utils import IntegrityError from django.template.loader import render_to_string @@ -37,6 +40,10 @@ from le_utils.constants import roles from past.builtins import basestring from past.utils import old_div +from search.models import ChannelFullTextSearch +from search.models import ContentNodeFullTextSearch +from search.utils import get_fts_annotated_channel_qs +from search.utils import get_fts_annotated_contentnode_qs from contentcuration import models as ccmodels from contentcuration.decorators import delay_user_storage_calculation @@ -808,6 +815,50 @@ def fill_published_fields(channel, version_notes): channel.save() +def sync_contentnode_and_channel_tsvectors(channel_id): + """ + Creates, deletes and updates tsvectors of the channel and all its content nodes + to reflect the current state of channel's main tree. + """ + # Update or create channel tsvector entry. + logging.info("Setting tsvector for channel with id {}.".format(channel_id)) + + channel = (get_fts_annotated_channel_qs() + .values("keywords_tsvector", "main_tree__tree_id") + .get(pk=channel_id)) + + obj, is_created = ChannelFullTextSearch.objects.update_or_create(channel_id=channel_id, defaults={"keywords_tsvector": channel["keywords_tsvector"]}) + del obj + + if is_created: + logging.info("Created 1 channel tsvector.") + else: + logging.info("Updated 1 channel tsvector.") + + # Update or create contentnodes tsvector entry for channel_id. + logging.info("Setting tsvectors for all main tree contentnodes in channel {}.".format(channel_id)) + + if ContentNodeFullTextSearch.objects.filter(channel_id=channel_id).exists(): + # First, delete nodes that are no longer in main_tree. + nodes_no_longer_in_main_tree = ~Exists(ccmodels.ContentNode.objects.filter(id=OuterRef("contentnode_id"), tree_id=channel["main_tree__tree_id"])) + ContentNodeFullTextSearch.objects.filter(nodes_no_longer_in_main_tree, channel_id=channel_id).delete() + + # Now, all remaining nodes are in main_tree, so let's update them. + # Update only changed nodes. + node_tsv_subquery = get_fts_annotated_contentnode_qs(channel_id).filter(id=OuterRef("contentnode_id")).order_by() + ContentNodeFullTextSearch.objects.filter(channel_id=channel_id, contentnode__complete=True, contentnode__changed=True).update( + keywords_tsvector=Subquery(node_tsv_subquery.values("keywords_tsvector")[:1]), + author_tsvector=Subquery(node_tsv_subquery.values("author_tsvector")[:1]) + ) + + # Insert newly created nodes. + # "set_contentnode_tsvectors" command is defined in "search/management/commands" directory. + call_command("set_contentnode_tsvectors", + "--channel-id={}".format(channel_id), + "--tree-id={}".format(channel["main_tree__tree_id"]), + "--complete") + + @delay_user_storage_calculation def publish_channel( user_id, @@ -829,8 +880,9 @@ def publish_channel( set_channel_icon_encoding(channel) kolibri_temp_db = create_content_database(channel, force, user_id, force_exercises, progress_tracker=progress_tracker) increment_channel_version(channel) - mark_all_nodes_as_published(channel) add_tokens_to_channel(channel) + sync_contentnode_and_channel_tsvectors(channel_id=channel.id) + mark_all_nodes_as_published(channel) fill_published_fields(channel, version_notes) # Attributes not getting set for some reason, so just save it here diff --git a/contentcuration/contentcuration/viewsets/channel.py b/contentcuration/contentcuration/viewsets/channel.py index 517e66f0fe..90b9a8f84d 100644 --- a/contentcuration/contentcuration/viewsets/channel.py +++ b/contentcuration/contentcuration/viewsets/channel.py @@ -24,6 +24,9 @@ from rest_framework.serializers import CharField from rest_framework.serializers import FloatField from rest_framework.serializers import IntegerField +from search.models import ChannelFullTextSearch +from search.models import ContentNodeFullTextSearch +from search.utils import get_fts_search_query from contentcuration.decorators import cache_no_user_data from contentcuration.models import Change @@ -119,23 +122,15 @@ def filter_deleted(self, queryset, name, value): return queryset.filter(deleted=value) def filter_keywords(self, queryset, name, value): - # TODO: Wait until we show more metadata on cards to add this back in - # keywords_query = self.main_tree_query.filter( - # Q(tags__tag_name__icontains=value) - # | Q(author__icontains=value) - # | Q(aggregator__icontains=value) - # | Q(provider__icontains=value) - # ) - return queryset.annotate( - # keyword_match_count=SQCount(keywords_query, field="content_id"), - primary_token=primary_token_subquery, - ).filter( - Q(name__icontains=value) - | Q(description__icontains=value) - | Q(pk__istartswith=value) - | Q(primary_token=value.replace("-", "")) - # | Q(keyword_match_count__gt=0) - ) + search_query = get_fts_search_query(value) + dash_replaced_search_query = get_fts_search_query(value.replace("-", "")) + + channel_keywords_query = (Exists(ChannelFullTextSearch.objects.filter( + Q(keywords_tsvector=search_query) | Q(keywords_tsvector=dash_replaced_search_query), channel_id=OuterRef("id")))) + contentnode_search_query = (Exists(ContentNodeFullTextSearch.objects.filter( + Q(keywords_tsvector=search_query) | Q(author_tsvector=search_query), channel_id=OuterRef("id")))) + + return queryset.filter(Q(channel_keywords_query) | Q(contentnode_search_query)) def filter_languages(self, queryset, name, value): languages = value.split(",") diff --git a/contentcuration/search/admin.py b/contentcuration/search/admin.py deleted file mode 100644 index 846f6b4061..0000000000 --- a/contentcuration/search/admin.py +++ /dev/null @@ -1 +0,0 @@ -# Register your models here. diff --git a/contentcuration/search/management/commands/set_channel_tsvectors.py b/contentcuration/search/management/commands/set_channel_tsvectors.py index d82f4848f5..68d7e17b51 100644 --- a/contentcuration/search/management/commands/set_channel_tsvectors.py +++ b/contentcuration/search/management/commands/set_channel_tsvectors.py @@ -7,11 +7,8 @@ from django.core.management.base import BaseCommand from django.db.models import Exists from django.db.models import OuterRef -from search.constants import CHANNEL_KEYWORDS_TSVECTOR from search.models import ChannelFullTextSearch - -from contentcuration.models import Channel -from contentcuration.viewsets.channel import primary_token_subquery +from search.utils import get_fts_annotated_channel_qs logmodule.basicConfig(level=logmodule.INFO) @@ -27,10 +24,8 @@ def handle(self, *args, **options): channel_not_already_inserted_query = ~Exists(ChannelFullTextSearch.objects.filter(channel_id=OuterRef("id"))) - channel_query = (Channel.objects.filter(channel_not_already_inserted_query, - deleted=False, main_tree__published=True) - .annotate(primary_channel_token=primary_token_subquery, - keywords_tsvector=CHANNEL_KEYWORDS_TSVECTOR) + channel_query = (get_fts_annotated_channel_qs().filter(channel_not_already_inserted_query, + deleted=False, main_tree__published=True) .values("id", "keywords_tsvector")) insertable_channels = list(channel_query[:CHUNKSIZE]) diff --git a/contentcuration/search/management/commands/set_contentnode_tsvectors.py b/contentcuration/search/management/commands/set_contentnode_tsvectors.py index 4e5673d9ec..c5e78ca8d1 100644 --- a/contentcuration/search/management/commands/set_contentnode_tsvectors.py +++ b/contentcuration/search/management/commands/set_contentnode_tsvectors.py @@ -4,15 +4,11 @@ import logging as logmodule import time -from django.contrib.postgres.aggregates import StringAgg from django.core.management.base import BaseCommand from django.db.models import Exists from django.db.models import OuterRef -from search.constants import CONTENTNODE_AUTHOR_TSVECTOR -from search.constants import CONTENTNODE_KEYWORDS_TSVECTOR from search.models import ContentNodeFullTextSearch - -from contentcuration.models import ContentNode +from search.utils import get_fts_annotated_contentnode_qs logmodule.basicConfig(level=logmodule.INFO) @@ -22,20 +18,39 @@ class Command(BaseCommand): + def add_arguments(self, parser): + parser.add_argument("--channel-id", type=str, dest="channel_id", + help="The channel_id to annotate to the nodes. If not specified then each node's channel_id is queried and then annotated.") + parser.add_argument("--tree-id", type=int, dest="tree_id", + help="Set tsvectors for a specific tree_id nodes only. If not specified then tsvectors for all nodes of ContentNode table are set.") + parser.add_argument("--published", dest="published", action="store_true", help="Filters on whether node is published or not.") + parser.add_argument("--complete", dest="complete", action="store_true", help="Filters on whether node is complete or not.") - def handle(self, *args, **options): - start = time.time() + def get_tsvector_nodes_queryset(self, *args, **options): + tsvector_nodes_queryset = get_fts_annotated_contentnode_qs(channel_id=options["channel_id"]) + + if options["tree_id"]: + tsvector_nodes_queryset = tsvector_nodes_queryset.filter(tree_id=options["tree_id"]) + + if options["complete"]: + tsvector_nodes_queryset = tsvector_nodes_queryset.filter(complete=True) + + if options["published"]: + tsvector_nodes_queryset = tsvector_nodes_queryset.filter(published=True) tsvector_not_already_inserted_query = ~Exists(ContentNodeFullTextSearch.objects.filter(contentnode_id=OuterRef("id"))) + tsvector_nodes_queryset = (tsvector_nodes_queryset + .filter(tsvector_not_already_inserted_query, channel_id__isnull=False) + .values("id", "channel_id", "keywords_tsvector", "author_tsvector").order_by()) + + return tsvector_nodes_queryset + + def handle(self, *args, **options): + start = time.time() - tsvector_node_query = (ContentNode._annotate_channel_id(ContentNode.objects) - .annotate(contentnode_tags=StringAgg("tags__tag_name", delimiter=" "), - keywords_tsvector=CONTENTNODE_KEYWORDS_TSVECTOR, - author_tsvector=CONTENTNODE_AUTHOR_TSVECTOR) - .filter(tsvector_not_already_inserted_query, published=True, channel_id__isnull=False) - .values("id", "channel_id", "keywords_tsvector", "author_tsvector").order_by()) + tsvector_nodes_queryset = self.get_tsvector_nodes_queryset(*args, **options) - insertable_nodes_tsvector = list(tsvector_node_query[:CHUNKSIZE]) + insertable_nodes_tsvector = list(tsvector_nodes_queryset[:CHUNKSIZE]) total_tsvectors_inserted = 0 while insertable_nodes_tsvector: @@ -54,6 +69,6 @@ def handle(self, *args, **options): logging.info("Inserted {} contentnode tsvectors.".format(current_inserts_count)) - insertable_nodes_tsvector = list(tsvector_node_query[:CHUNKSIZE]) + insertable_nodes_tsvector = list(tsvector_nodes_queryset[:CHUNKSIZE]) logging.info("Completed! Successfully inserted total of {} contentnode tsvectors in {} seconds.".format(total_tsvectors_inserted, time.time() - start)) diff --git a/contentcuration/search/serializers.py b/contentcuration/search/serializers.py deleted file mode 100644 index 4137c0e47f..0000000000 --- a/contentcuration/search/serializers.py +++ /dev/null @@ -1,18 +0,0 @@ -from contentcuration import models as cc_models -from rest_framework import serializers - - -class ContentSearchResultSerializer(serializers.ModelSerializer): - - class Meta: - model = cc_models.ContentNode - fields = ( - 'id', - 'original_channel_id', - 'source_channel_id', - 'title', - 'kind', - 'tags', - 'children', - 'tree_id' - ) diff --git a/contentcuration/search/tests.py b/contentcuration/search/tests.py deleted file mode 100644 index 2341fc9d1b..0000000000 --- a/contentcuration/search/tests.py +++ /dev/null @@ -1,31 +0,0 @@ -from __future__ import absolute_import - -from django.urls import reverse - -from contentcuration.tests import testdata -from contentcuration.tests.base import StudioAPITestCase - - -class SearchViewsetTestCase(StudioAPITestCase): - - def test_filter_exclude_channels(self): - user = testdata.user() - self.client.force_authenticate(user=user) - channel = testdata.channel() - channel.editors.add(user) - response = self.client.get( - reverse("search-list"), data={"exclude_channel": channel.id}, format="json", - ) - self.assertEqual(response.status_code, 200, response.content) - self.assertEqual(response.data["results"], []) - - def test_filter_channels_by_edit(self): - user = testdata.user() - self.client.force_authenticate(user=user) - channel = testdata.channel() - channel.editors.add(user) - response = self.client.get( - reverse("search-list"), data={"channel_list": "edit"}, format="json", - ) - self.assertEqual(response.status_code, 200, response.content) - self.assertNotEqual(response.data["results"], []) diff --git a/contentcuration/contentcuration/debug/__init__.py b/contentcuration/search/tests/__init__.py similarity index 100% rename from contentcuration/contentcuration/debug/__init__.py rename to contentcuration/search/tests/__init__.py diff --git a/contentcuration/search/tests/test_search.py b/contentcuration/search/tests/test_search.py new file mode 100644 index 0000000000..8489ece577 --- /dev/null +++ b/contentcuration/search/tests/test_search.py @@ -0,0 +1,112 @@ +from __future__ import absolute_import + +from django.urls import reverse + +from contentcuration.models import Channel +from contentcuration.models import ContentNode +from contentcuration.tests import testdata +from contentcuration.tests.base import StudioAPITestCase +from contentcuration.utils.publish import sync_contentnode_and_channel_tsvectors + + +def dummy_publish(channel): + channel_nodes = ContentNode.objects.filter(tree_id=channel.main_tree.tree_id) + for node in channel_nodes: + node.published = True + node.changed = False + node.save() + sync_contentnode_and_channel_tsvectors(channel_id=channel.id) + + +class SearchViewsetTestCase(StudioAPITestCase): + def setUp(self): + super().setUp() + self.channel = testdata.channel() + self.user = testdata.user() + self.channel.editors.add(self.user) + dummy_publish(self.channel) + + def test_filter_exclude_channels(self): + self.client.force_authenticate(user=self.user) + response = self.client.get( + reverse("search-list"), data={"exclude_channel": self.channel.id}, format="json", + ) + self.assertEqual(response.status_code, 200, response.content) + self.assertEqual(response.data["results"], []) + + def test_filter_channels_by_edit(self): + self.client.force_authenticate(user=self.user) + response = self.client.get( + reverse("search-list"), data={"channel_list": "edit"}, format="json", + ) + self.assertEqual(response.status_code, 200, response.content) + self.assertNotEqual(response.data["results"], []) + + def test_search(self): + users = [] + channels = [] + + # Create channels, users. + for i in range(4): + user = testdata.user(email="a{}@a.com".format(i)) + users.append(user) + + channel = Channel.objects.create(name="user_a{}_channel".format(i)) + channel.save() + channels.append(channel) + channel.editors.add(user) + + public_channel, editable_channel, viewable_channel, inaccessible_channel = channels + + # Create public video node. + public_video_node = testdata.node({ + "title": "Kolibri video", + "kind_id": "video", + }, parent=public_channel.main_tree) + public_channel.public = True + public_channel.save() + + # Set user_b viewable channel. + user_b = users[1] + viewable_channel.viewers.add(user_b) + + public_video_node.refresh_from_db() + public_video_node.copy_to(target=editable_channel.main_tree) + public_video_node.copy_to(target=viewable_channel.main_tree) + public_video_node.copy_to(target=inaccessible_channel.main_tree) + + # Publish all channels to make them searchable. + for channel in channels: + dummy_publish(channel) + + # Get different nodes based on access. + editable_channel.main_tree.refresh_from_db() + editable_video_node = editable_channel.main_tree.get_descendants().first() + viewable_channel.main_tree.refresh_from_db() + viewable_video_node = viewable_channel.main_tree.get_descendants().first() + inaccessible_channel.main_tree.refresh_from_db() + inaccessible_video_node = inaccessible_channel.main_tree.get_descendants().first() + + # Send request from user_b to the search endpoint. + self.client.force_authenticate(user=user_b) + + for channel_list in ("public", "edit", "view"): + response = self.client.get( + reverse("search-list"), + data={ + "channel_list": channel_list, + "keywords": "video" + }, + format="json", + ) + + result = response.data["results"][0] + + self.assertNotEqual(result["id"], inaccessible_video_node.id) + + if channel_list == "public": + self.assertEqual(result["id"], public_video_node.id) + elif channel_list == "edit": + self.assertEqual(result["id"], editable_video_node.id) + elif channel_list == "view": + self.assertEqual(result["id"], viewable_video_node.id) diff --git a/contentcuration/search/utils.py b/contentcuration/search/utils.py new file mode 100644 index 0000000000..4f6768f650 --- /dev/null +++ b/contentcuration/search/utils.py @@ -0,0 +1,48 @@ +from django.contrib.postgres.aggregates import StringAgg +from django.contrib.postgres.search import SearchQuery +from django.db.models import Value +from search.constants import CHANNEL_KEYWORDS_TSVECTOR +from search.constants import CONTENTNODE_AUTHOR_TSVECTOR +from search.constants import CONTENTNODE_KEYWORDS_TSVECTOR +from search.constants import POSTGRES_FTS_CONFIG + + +def get_fts_search_query(value): + """ + Returns a `SearchQuery` with our postgres full text search config set on it. + """ + return SearchQuery(value=value, config=POSTGRES_FTS_CONFIG) + + +def get_fts_annotated_contentnode_qs(channel_id=None): + """ + Returns a `ContentNode` queryset annotated with fields required for full text search. + + If `channel_id` is provided, annotates that specific `channel_id` else annotates + the `channel_id` to which the contentnode belongs. + """ + from contentcuration.models import ContentNode + + if channel_id: + queryset = ContentNode.objects.annotate(channel_id=Value(channel_id)) + else: + queryset = ContentNode._annotate_channel_id(ContentNode.objects) + + queryset = queryset.annotate( + contentnode_tags=StringAgg("tags__tag_name", delimiter=" "), + keywords_tsvector=CONTENTNODE_KEYWORDS_TSVECTOR, + author_tsvector=CONTENTNODE_AUTHOR_TSVECTOR + ) + + return queryset + + +def get_fts_annotated_channel_qs(): + """ + Returns a `Channel` queryset annotated with fields required for full text search. + """ + from contentcuration.models import Channel + from contentcuration.viewsets.channel import primary_token_subquery + + return Channel.objects.annotate(primary_channel_token=primary_token_subquery, + keywords_tsvector=CHANNEL_KEYWORDS_TSVECTOR) diff --git a/contentcuration/search/viewsets/contentnode.py b/contentcuration/search/viewsets/contentnode.py index 54b5f437aa..676698031d 100644 --- a/contentcuration/search/viewsets/contentnode.py +++ b/contentcuration/search/viewsets/contentnode.py @@ -1,41 +1,36 @@ import re -from django.db.models import Case +from django.db.models import ExpressionWrapper from django.db.models import F from django.db.models import IntegerField from django.db.models import OuterRef -from django.db.models import Q from django.db.models import Subquery from django.db.models import Value -from django.db.models import When from django.db.models.functions import Coalesce from django_filters.rest_framework import BooleanFilter from django_filters.rest_framework import CharFilter from le_utils.constants import content_kinds from le_utils.constants import roles +from rest_framework.permissions import IsAuthenticated +from search.models import ContentNodeFullTextSearch +from search.utils import get_fts_search_query from contentcuration.models import Channel -from contentcuration.models import ContentNode from contentcuration.models import File -from contentcuration.utils.pagination import CachedListPagination +from contentcuration.utils.pagination import ValuesViewsetPageNumberPagination +from contentcuration.viewsets.base import ReadOnlyValuesViewset from contentcuration.viewsets.base import RequiredFilterSet from contentcuration.viewsets.common import NotNullMapArrayAgg -from contentcuration.viewsets.common import SQArrayAgg -from contentcuration.viewsets.common import SQCount from contentcuration.viewsets.common import UUIDFilter from contentcuration.viewsets.common import UUIDInFilter -from contentcuration.viewsets.contentnode import ContentNodeViewSet -class ListPagination(CachedListPagination): +class ListPagination(ValuesViewsetPageNumberPagination): page_size = 25 page_size_query_param = "page_size" max_page_size = 100 -uuid_re = re.compile("([a-f0-9]{32})") - - class ContentNodeFilter(RequiredFilterSet): keywords = CharFilter(method="filter_keywords") languages = CharFilter(method="filter_languages") @@ -53,169 +48,127 @@ class ContentNodeFilter(RequiredFilterSet): def filter_channel_list(self, queryset, name, value): user = not self.request.user.is_anonymous and self.request.user channel_ids = [] + if value == "public": - channel_ids = Channel.objects.filter(public=True, deleted=False).values_list("id", flat=True) + channel_ids = Channel.get_public_channels().values_list("id", flat=True) elif value == "edit" and user: - channel_ids = user.editable_channels.values_list("id", flat=True) + channel_ids = user.editable_channels.filter(deleted=False).values_list("id", flat=True) elif value == "bookmark" and user: - channel_ids = user.bookmarked_channels.values_list("id", flat=True) + channel_ids = user.bookmarked_channels.filter(deleted=False).values_list("id", flat=True) elif value == "view" and user: - channel_ids = user.view_only_channels.values_list("id", flat=True) + channel_ids = user.view_only_channels.filter(deleted=False).values_list("id", flat=True) + return queryset.filter(channel_id__in=list(channel_ids)) def filter_keywords(self, queryset, name, value): - filter_query = Q(title__icontains=value) | Q(description__icontains=value) - tags_node_ids = ContentNode.tags.through.objects.filter( - contenttag__tag_name__icontains=value - ).values_list("contentnode_id", flat=True)[:250] - # Check if we have a Kolibri node id or ids and add them to the search if so. - # Add to, rather than replace, the filters so that we never misinterpret a search term as a UUID. - # node_ids = uuid_re.findall(value) + list(tags_node_ids) - node_ids = uuid_re.findall(value) - for node_id in node_ids: - # check for the major ID types - filter_query |= Q(node_id=node_id) - filter_query |= Q(content_id=node_id) - filter_query |= Q(id=node_id) - for node_id in tags_node_ids: - filter_query |= Q(id=node_id) - - return queryset.filter(filter_query) + return queryset.filter(keywords_tsvector=get_fts_search_query(value)) def filter_author(self, queryset, name, value): - return queryset.filter( - Q(author__icontains=value) - | Q(aggregator__icontains=value) - | Q(provider__icontains=value) - ) + return queryset.filter(author_tsvector=get_fts_search_query(value)) def filter_languages(self, queryset, name, value): - return queryset.filter(language__lang_code__in=value.split(",")) + return queryset.filter(contentnode__language__lang_code__in=value.split(",")) def filter_licenses(self, queryset, name, value): licenses = [int(li) for li in value.split(",")] - return queryset.filter(license__in=licenses) + return queryset.filter(contentnode__license__in=licenses) def filter_kinds(self, queryset, name, value): - return queryset.filter(kind_id__in=value.split(",")) + return queryset.filter(contentnode__kind_id__in=value.split(",")) def filter_coach(self, queryset, name, value): - return queryset.filter(role_visibility=roles.COACH) + return queryset.filter(contentnode__role_visibility=roles.COACH) def filter_resources(self, queryset, name, value): - return queryset.exclude(kind_id=content_kinds.TOPIC) + return queryset.exclude(contentnode__kind_id=content_kinds.TOPIC) def filter_assessments(self, queryset, name, value): - return queryset.filter(kind_id=content_kinds.EXERCISE) + return queryset.filter(contentnode__kind_id=content_kinds.EXERCISE) def filter_created_after(self, queryset, name, value): date = re.search(r"(\d{4})-0?(\d+)-(\d+)", value) return queryset.filter( - created__year__gte=date.group(1), - created__month__gte=date.group(2), - created__day__gte=date.group(3), - ) - - class Meta: - model = ContentNode - fields = ( - "keywords", - "languages", - "licenses", - "kinds", - "coach", - "author", - "resources", - "assessments", + contentnode__created__year__gte=date.group(1), + contentnode__created__month__gte=date.group(2), + contentnode__created__day__gte=date.group(3), ) -class SearchContentNodeViewSet(ContentNodeViewSet): +class SearchContentNodeViewSet(ReadOnlyValuesViewset): filterset_class = ContentNodeFilter pagination_class = ListPagination + permission_classes = [IsAuthenticated] + queryset = ContentNodeFullTextSearch.objects.all() + + field_map = { + "id": "contentnode__id", + "content_id": "contentnode__content_id", + "node_id": "contentnode__node_id", + "title": "contentnode__title", + "description": "contentnode__description", + "author": "contentnode__author", + "provider": "contentnode__provider", + "kind__kind": "contentnode__kind__kind", + "thumbnail_encoding": "contentnode__thumbnail_encoding", + "published": "contentnode__published", + "modified": "contentnode__modified", + "parent_id": "contentnode__parent_id", + "changed": "contentnode__changed", + } + values = ( - "id", - "content_id", - "node_id", - "title", - "description", - "author", - "provider", - "kind__kind", + "contentnode__id", + "contentnode__content_id", + "contentnode__node_id", + "contentnode__title", + "contentnode__description", + "contentnode__author", + "contentnode__provider", + "contentnode__kind__kind", + "contentnode__thumbnail_encoding", + "contentnode__published", + "contentnode__modified", + "contentnode__parent_id", + "contentnode__changed", "channel_id", "resource_count", "thumbnail_checksum", "thumbnail_extension", - "thumbnail_encoding", - "published", - "modified", - "parent_id", - "changed", - "location_ids", "content_tags", "original_channel_name", ) def annotate_queryset(self, queryset): """ - 1. Do a distinct by 'content_id,' using the original node if possible - 2. Annotate lists of content node and channel pks + Annotates thumbnails, resources count and original channel name. """ - # Get accessible content nodes that match the content id - content_id_query = ContentNode.filter_view_queryset(ContentNode.objects.all(), self.request.user).filter( - content_id=OuterRef("content_id") - ) - - # Combine by unique content id - deduped_content_query = ( - content_id_query.filter(content_id=OuterRef("content_id")) - .annotate( - is_original=Case( - When(original_source_node_id=F("node_id"), then=Value(1)), - default=Value(2), - output_field=IntegerField(), - ), - ) - .order_by("is_original", "created") - ) - queryset = queryset.filter( - pk__in=Subquery(deduped_content_query.values_list("id", flat=True)[:1]) - ).order_by() - thumbnails = File.objects.filter( - contentnode=OuterRef("id"), preset__thumbnail=True + contentnode=OuterRef("contentnode__id"), preset__thumbnail=True ) - descendant_resources = ( - ContentNode.objects.filter( - tree_id=OuterRef("tree_id"), - lft__gt=OuterRef("lft"), - rght__lt=OuterRef("rght"), - ) - .exclude(kind_id=content_kinds.TOPIC) - .values("id", "role_visibility", "changed") - .order_by() - ) + descendant_resources_count = ExpressionWrapper(((F("contentnode__rght") - F("contentnode__lft") - Value(1)) / Value(2)), output_field=IntegerField()) + original_channel_name = Coalesce( Subquery( - Channel.objects.filter(pk=OuterRef("original_channel_id")).values( + Channel.objects.filter(pk=OuterRef("contentnode__original_channel_id")).values( "name" )[:1] ), Subquery( - Channel.objects.filter(main_tree__tree_id=OuterRef("tree_id")).values( + Channel.objects.filter(main_tree__tree_id=OuterRef("contentnode__tree_id")).values( "name" )[:1] ), ) + queryset = queryset.annotate( - location_ids=SQArrayAgg(content_id_query, field="id"), - resource_count=SQCount(descendant_resources, field="id"), + resource_count=descendant_resources_count, thumbnail_checksum=Subquery(thumbnails.values("checksum")[:1]), thumbnail_extension=Subquery( thumbnails.values("file_format__extension")[:1] ), - content_tags=NotNullMapArrayAgg("tags__tag_name"), + content_tags=NotNullMapArrayAgg("contentnode__tags__tag_name"), original_channel_name=original_channel_name, ) + return queryset diff --git a/requirements-dev.in b/requirements-dev.in index 6c3b1033f8..5d977035f0 100644 --- a/requirements-dev.in +++ b/requirements-dev.in @@ -1,7 +1,7 @@ -c requirements.txt django-concurrent-test-helper==0.7.0 django-debug-panel==0.8.3 -django-debug-toolbar==1.9.1 +django-debug-toolbar==3.5.0 flake8==3.4.1 whitenoise Pympler diff --git a/requirements-dev.txt b/requirements-dev.txt index 383ee77801..f683adee29 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -66,7 +66,7 @@ django-concurrent-test-helper==0.7.0 # via -r requirements-dev.in django-debug-panel==0.8.3 # via -r requirements-dev.in -django-debug-toolbar==1.9.1 +django-debug-toolbar==3.5.0 # via # -r requirements-dev.in # django-debug-panel