From d5151d4d93a0b9f59fb5fe3c0b68e7c9e3562c81 Mon Sep 17 00:00:00 2001 From: "jas.bali" Date: Tue, 7 Oct 2025 12:43:25 -0400 Subject: [PATCH 1/6] Add workflow_with_ai_parse_document example This example demonstrates incremental document processing using: - ai_parse_document for extracting structured data from PDFs/images - ai_query for LLM-based content analysis - Databricks Workflows with Structured Streaming and serverless compute Key features: - Python notebooks with Structured Streaming for incremental processing - Serverless compute for cost efficiency - Parameterized workflow with catalog, schema, and table names - Checkpointed streaming to process only new data - Visual debugging notebook with interactive bounding boxes --- .../.gitignore | 28 + .../workflow_with_ai_parse_document/README.md | 152 ++++ .../databricks.yml | 52 ++ .../ai_parse_document_workflow.job.yml | 55 ++ .../ai_parse_document -- debug output.py | 782 ++++++++++++++++++ .../src/transformations/01_parse_documents.py | 79 ++ .../src/transformations/02_extract_text.py | 72 ++ .../03_extract_structured_data.py | 75 ++ 8 files changed, 1295 insertions(+) create mode 100644 knowledge_base/workflow_with_ai_parse_document/.gitignore create mode 100644 knowledge_base/workflow_with_ai_parse_document/README.md create mode 100644 knowledge_base/workflow_with_ai_parse_document/databricks.yml create mode 100644 knowledge_base/workflow_with_ai_parse_document/resources/ai_parse_document_workflow.job.yml create mode 100644 knowledge_base/workflow_with_ai_parse_document/src/explorations/ai_parse_document -- debug output.py create mode 100644 knowledge_base/workflow_with_ai_parse_document/src/transformations/01_parse_documents.py create mode 100644 knowledge_base/workflow_with_ai_parse_document/src/transformations/02_extract_text.py create mode 100644 knowledge_base/workflow_with_ai_parse_document/src/transformations/03_extract_structured_data.py diff --git 
a/knowledge_base/workflow_with_ai_parse_document/.gitignore b/knowledge_base/workflow_with_ai_parse_document/.gitignore new file mode 100644 index 00000000..0707725d --- /dev/null +++ b/knowledge_base/workflow_with_ai_parse_document/.gitignore @@ -0,0 +1,28 @@ +# Databricks +.databricks/ + +# Python +build/ +dist/ +__pycache__/ +*.egg-info +.venv/ +*.py[cod] + +# Local configuration (keep your settings private) +databricks.local.yml + +# IDE +.idea/ +.vscode/ +.DS_Store + +# Scratch/temporary files +scratch/** +!scratch/README.md + +# Test documents (don't commit large PDFs) +*.pdf +*.png +*.jpg +*.jpeg diff --git a/knowledge_base/workflow_with_ai_parse_document/README.md b/knowledge_base/workflow_with_ai_parse_document/README.md new file mode 100644 index 00000000..9d32ff70 --- /dev/null +++ b/knowledge_base/workflow_with_ai_parse_document/README.md @@ -0,0 +1,152 @@ +# AI Document Processing Workflow with Structured Streaming + +A Databricks Asset Bundle demonstrating **incremental document processing** using `ai_parse_document`, `ai_query`, and Databricks Workflows with Structured Streaming. + +## Overview + +This example shows how to build an incremental workflow that: +1. **Parses** PDFs and images using [`ai_parse_document`](https://docs.databricks.com/aws/en/sql/language-manual/functions/ai_parse_document) +2. **Extracts** clean text with incremental processing +3. **Analyzes** content using [`ai_query`](https://docs.databricks.com/aws/en/sql/language-manual/functions/ai_query) with LLMs + +All stages run as Python notebook tasks in a Databricks Workflow using Structured Streaming with serverless compute. 
+ +## Architecture + +``` +Source Documents (UC Volume) + ↓ + Task 1: ai_parse_document → parsed_documents_raw (variant) + ↓ + Task 2: text extraction → parsed_documents_text (string) + ↓ + Task 3: ai_query → parsed_documents_structured (json) +``` + +### Key Features + +- **Incremental processing**: Only new files are processed using Structured Streaming checkpoints +- **Serverless compute**: Runs on serverless compute for cost efficiency +- **Task dependencies**: Sequential execution with automatic dependency management +- **Parameterized**: Catalog, schema, volumes, and table names configurable via variables +- **Error handling**: Gracefully handles parsing failures +- **Visual debugging**: Interactive notebook for inspecting results + +## Prerequisites + +- Databricks workspace with Unity Catalog +- Databricks CLI v0.218.0+ +- Unity Catalog volumes for: + - Source documents (PDFs/images) + - Parsed output images + - Streaming checkpoints +- Access to the AI functions `ai_parse_document` and `ai_query` + +## Quick Start + +1. **Install the Databricks CLI and authenticate** + ```bash + databricks auth login --host https://your-workspace.cloud.databricks.com + ``` + +2. **Configure** `databricks.yml` with your workspace settings + +3. **Validate** the bundle configuration + ```bash + databricks bundle validate + ``` + +4. **Deploy** + ```bash + databricks bundle deploy + ``` + +5. **Upload documents** to your source volume + +6. 
**Run workflow** from the Databricks UI (Workflows) or with `databricks bundle run ai_parse_document_workflow` + +## Configuration + +Edit `databricks.yml`: + +```yaml +variables: + catalog: main # Your catalog + schema: default # Your schema + source_volume_path: /Volumes/main/default/source_documents # Source PDFs + output_volume_path: /Volumes/main/default/parsed_output # Parsed images + checkpoint_base_path: /tmp/checkpoints/ai_parse_workflow # Checkpoints (point this at a UC volume; see Prerequisites) + raw_table_name: parsed_documents_raw # Table names + text_table_name: parsed_documents_text + structured_table_name: parsed_documents_structured +``` + +## Workflow Tasks + +### Task 1: Document Parsing +**File**: `src/transformations/01_parse_documents.py` + +Uses `ai_parse_document` to extract text, tables, and metadata from PDFs/images: +- Reads files from volume using Structured Streaming +- Stores variant output with bounding boxes +- Incremental: checkpointed streaming prevents reprocessing + +### Task 2: Text Extraction +**File**: `src/transformations/02_extract_text.py` + +Extracts clean concatenated text using `transform()`: +- Reads from previous task's table via streaming +- Handles both parser v1.0 and v2.0 formats +- Uses `transform()` for efficient text extraction +- Includes error handling for failed parses + +### Task 3: AI Query Extraction +**File**: `src/transformations/03_extract_structured_data.py` + +Applies an LLM to extract structured insights: +- Reads from text table via streaming +- Uses `ai_query` with Claude Sonnet 4 +- Customizable prompt for domain-specific extraction +- Outputs structured JSON + +## Visual Debugger + +The included notebook visualizes parsing results with interactive bounding boxes. 
+ +**Open**: `src/explorations/ai_parse_document -- debug output.py` + +**Configure widgets**: +- `input_file`: `/Volumes/main/default/source_documents/sample.pdf` +- `image_output_path`: `/Volumes/main/default/parsed_output/` +- `page_selection`: `all` (or `1-3`, `1,5,10`) + +**Features**: +- Color-coded bounding boxes by element type +- Hover tooltips showing extracted content +- Automatic image scaling +- Page selection support + +## Project Structure + +``` +. +├── databricks.yml # Bundle configuration +├── resources/ +│ └── ai_parse_document_workflow.job.yml +├── src/ +│ ├── transformations/ +│ │ ├── 01_parse_documents.py +│ │ ├── 02_extract_text.py +│ │ └── 03_extract_structured_data.py +│ └── explorations/ +│ └── ai_parse_document -- debug output.py +└── README.md +``` + +## Resources + +- [Databricks Asset Bundles](https://docs.databricks.com/dev-tools/bundles/) +- [Databricks Workflows](https://docs.databricks.com/workflows/) +- [Structured Streaming](https://docs.databricks.com/structured-streaming/) +- [`ai_parse_document` Function](https://docs.databricks.com/aws/en/sql/language-manual/functions/ai_parse_document) +- [`ai_query` Function](https://docs.databricks.com/aws/en/sql/language-manual/functions/ai_query) diff --git a/knowledge_base/workflow_with_ai_parse_document/databricks.yml b/knowledge_base/workflow_with_ai_parse_document/databricks.yml new file mode 100644 index 00000000..c8784a9a --- /dev/null +++ b/knowledge_base/workflow_with_ai_parse_document/databricks.yml @@ -0,0 +1,52 @@ +# This is a Databricks asset bundle definition for ai_parse_document_workflow. +# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. 
+bundle: + name: ai_parse_document_workflow + +variables: + catalog: + description: The catalog name for the workflow + default: main + schema: + description: The schema name for the workflow + default: default + source_volume_path: + description: Source volume path for PDF files + default: /Volumes/main/default/source_documents + output_volume_path: + description: Output volume path for processed images + default: /Volumes/main/default/parsed_output + checkpoint_base_path: + description: Base path for Structured Streaming checkpoints + default: /tmp/checkpoints/ai_parse_workflow + raw_table_name: + description: Table name for raw parsed documents + default: parsed_documents_raw + text_table_name: + description: Table name for extracted text + default: parsed_documents_text + structured_table_name: + description: Table name for structured data + default: parsed_documents_structured + +include: + - resources/*.yml + +targets: + dev: + # The default target uses 'mode: development' to create a development copy. + # - Deployed resources get prefixed with '[dev my_user_name]' + # - Any job schedules and triggers are paused by default. + # See also https://docs.databricks.com/dev-tools/bundles/deployment-modes.html. 
+ mode: development + default: true + workspace: + host: https://your-workspace.cloud.databricks.com + + prod: + mode: production + workspace: + host: https://your-workspace.cloud.databricks.com + permissions: + - group_name: users + level: CAN_VIEW diff --git a/knowledge_base/workflow_with_ai_parse_document/resources/ai_parse_document_workflow.job.yml b/knowledge_base/workflow_with_ai_parse_document/resources/ai_parse_document_workflow.job.yml new file mode 100644 index 00000000..1a425bf2 --- /dev/null +++ b/knowledge_base/workflow_with_ai_parse_document/resources/ai_parse_document_workflow.job.yml @@ -0,0 +1,55 @@ +resources: + jobs: + ai_parse_document_workflow: + name: ai_parse_document_workflow + + environments: + - environment_key: serverless_env + spec: + client: "3" + + tasks: + - task_key: parse_documents + environment_key: serverless_env + notebook_task: + notebook_path: ../src/transformations/01_parse_documents.py + base_parameters: + catalog: ${var.catalog} + schema: ${var.schema} + source_volume_path: ${var.source_volume_path} + output_volume_path: ${var.output_volume_path} + checkpoint_location: ${var.checkpoint_base_path}/01_parse_documents + table_name: ${var.raw_table_name} + + - task_key: extract_text + depends_on: + - task_key: parse_documents + environment_key: serverless_env + notebook_task: + notebook_path: ../src/transformations/02_extract_text.py + base_parameters: + catalog: ${var.catalog} + schema: ${var.schema} + checkpoint_location: ${var.checkpoint_base_path}/02_extract_text + source_table_name: ${var.raw_table_name} + table_name: ${var.text_table_name} + + - task_key: extract_structured_data + depends_on: + - task_key: extract_text + environment_key: serverless_env + notebook_task: + notebook_path: ../src/transformations/03_extract_structured_data.py + base_parameters: + catalog: ${var.catalog} + schema: ${var.schema} + checkpoint_location: ${var.checkpoint_base_path}/03_extract_structured_data + source_table_name: 
${var.text_table_name} + table_name: ${var.structured_table_name} + + max_concurrent_runs: 1 + + # Optional: Add a schedule + # schedule: + # quartz_cron_expression: "0 0 * * * ?" + # timezone_id: "UTC" diff --git a/knowledge_base/workflow_with_ai_parse_document/src/explorations/ai_parse_document -- debug output.py b/knowledge_base/workflow_with_ai_parse_document/src/explorations/ai_parse_document -- debug output.py new file mode 100644 index 00000000..2f39afab --- /dev/null +++ b/knowledge_base/workflow_with_ai_parse_document/src/explorations/ai_parse_document -- debug output.py @@ -0,0 +1,782 @@ +# Databricks notebook source +# MAGIC %md +# MAGIC # 🔍 AI Parse Document Debug Interface +# MAGIC +# MAGIC Version 1.3 +# MAGIC +# MAGIC Last update: Oct 6, 2025 +# MAGIC +# MAGIC Changelog: +# MAGIC - Simplified widget parameters: `input_file` and `image_output_path` now accept full volume paths +# MAGIC - Removed separate `catalog`, `schema`, `volume` widgets +# MAGIC - `input_file` supports wildcards for processing multiple files (e.g., `/Volumes/catalog/schema/volume/input/*`) +# MAGIC +# MAGIC ## Overview +# MAGIC This notebook provides a **visual debugging interface** for analyzing the output of Databricks' `ai_parse_document` function. It renders parsed documents with interactive bounding box overlays, allowing you to inspect what content was extracted from each region of your documents. +# MAGIC +# MAGIC ## Features +# MAGIC - 📊 **Visual Bounding Boxes**: Color-coded overlays showing the exact regions where text/elements were detected +# MAGIC - 🎯 **Interactive Tooltips**: Hover over any bounding box to see the parsed content from that region +# MAGIC - 📐 **Automatic Scaling**: Large documents are automatically scaled to fit within 1024px width for optimal viewing +# MAGIC - 🎨 **Element Type Visualization**: Different colors for different element types (text, headers, tables, figures, etc.) 
+# MAGIC +# MAGIC ## Required Parameters +# MAGIC +# MAGIC This interface requires widget parameters to be configured before running: +# MAGIC +# MAGIC ### 1. `input_file` +# MAGIC - **Description**: Full Unity Catalog volume path to the document(s) you want to parse and visualize +# MAGIC - **Examples**: +# MAGIC - Single file: `/Volumes/catalog/schema/volume/input/document.pdf` +# MAGIC - All files in directory: `/Volumes/catalog/schema/volume/input/*` +# MAGIC - Pattern matching: `/Volumes/catalog/schema/volume/input/*.pdf` +# MAGIC - **Requirements**: Read access to the volume containing your PDF/image files +# MAGIC +# MAGIC ### 2. `image_output_path` +# MAGIC - **Description**: Full Unity Catalog volume path where `ai_parse_document` will store the extracted page images +# MAGIC - **Example**: `/Volumes/catalog/schema/volume/output/` +# MAGIC - **Requirements**: Write access required for storing intermediate image outputs +# MAGIC - **Note**: As documented in the [official Databricks documentation](https://docs.databricks.com/aws/en/sql/language-manual/functions/ai_parse_document), this path is used by the parsing function to store page images that are referenced in the output +# MAGIC +# MAGIC ### 3. `page_selection` +# MAGIC - **Description**: Specifies which pages to display in the visualization +# MAGIC - **Supported formats**: +# MAGIC - `"all"` or leave empty: Display all pages +# MAGIC - `"3"`: Display only page 3 (1-indexed) +# MAGIC - `"1-5"`: Display pages 1 through 5 (inclusive, 1-indexed) +# MAGIC - `"1,3,5"`: Display specific pages (1-indexed) +# MAGIC - `"1-3,7,10-12"`: Mixed ranges and individual pages +# MAGIC +# MAGIC ## Usage Instructions +# MAGIC +# MAGIC 1. **Clone this notebook** to your workspace: +# MAGIC - Select **"File -> Clone"** button in the top toolbar +# MAGIC - Choose your desired location in your workspace +# MAGIC - This ensures you have a personal copy you can modify and run +# MAGIC +# MAGIC 2. 
**Prepare your Unity Catalog volumes**: +# MAGIC - Create or identify a volume for your PDF/image files +# MAGIC - Create or identify a volume for output images +# MAGIC - Upload your PDF files to the input location +# MAGIC +# MAGIC 3. **Configure the widget parameters** at the top of this notebook: +# MAGIC - Set `input_file` to the full volume path (file or directory with wildcard) +# MAGIC - Set `image_output_path` to the full volume path for outputs +# MAGIC - Set `page_selection` to control which pages to visualize +# MAGIC +# MAGIC 4. **Run all code cells** which will generate visual debugging results. +# MAGIC +# MAGIC ## What You'll See +# MAGIC +# MAGIC - **Document Summary**: Overview of pages, element counts, and document metadata +# MAGIC - **Color Legend**: Visual guide showing which colors represent which element types +# MAGIC - **Annotated Images**: Each page with overlaid bounding boxes +# MAGIC - Hover over any box to see the extracted content +# MAGIC - Yellow highlight indicates the currently hovered element +# MAGIC - **Parsed Elements List**: Complete list of all extracted elements with their content + +# COMMAND ---------- + +# Exec Parameters + +dbutils.widgets.text("input_file", "/Volumes/main/default/source_documents/sample.pdf") +dbutils.widgets.text("image_output_path", "/Volumes/main/default/parsed_output/") +dbutils.widgets.text("page_selection", "all") + +input_file = dbutils.widgets.get("input_file") +image_output_path = dbutils.widgets.get("image_output_path") +page_selection = dbutils.widgets.get("page_selection") + +# COMMAND ---------- + +# DBTITLE 1,Configuration Parameters +# Path configuration - use widget values as-is + +source_files = input_file + +# Parse page selection string and return list of page indices to display. 
+# +# Supported formats: +# - "all" or None: Display all pages +# - "3": Display specific page (1-indexed) +# - "1-5": Display page range (inclusive, 1-indexed) +# - "1,3,5": Display list of specific pages (1-indexed) +# - "1-3,7,10-12": Mixed ranges and individual pages +page_selection = f"{page_selection}" + +# COMMAND ---------- + +# DBTITLE 1,Run Document Parse Code (may take some time) +# SQL statement with ai_parse_document() +# Note: input_file can be a single file path or a directory path with wildcard +sql = f''' +with parsed_documents AS ( + SELECT + path, + ai_parse_document(content + , + map( + 'version', '2.0', + 'imageOutputPath', '{image_output_path}', + 'descriptionElementTypes', '*' + ) + ) as parsed + FROM + read_files('{source_files}', format => 'binaryFile') +) +select * from parsed_documents +''' + +parsed_results = [row.parsed for row in spark.sql(sql).collect()] + +# COMMAND ---------- + +import json +from typing import Dict, List, Any, Optional, Tuple, Set, Union +from IPython.display import HTML, display +import base64 +import os +from PIL import Image +import io + +class DocumentRenderer: + def __init__(self): + # Color mapping for different element types + self.element_colors = { + 'section_header': '#FF6B6B', + 'text': '#4ECDC4', + 'figure': '#45B7D1', + 'caption': '#96CEB4', + 'page_footer': '#FFEAA7', + 'page_header': '#DDA0DD', + 'table': '#98D8C8', + 'list': '#F7DC6F', + 'default': '#BDC3C7' + } + + def _parse_page_selection(self, page_selection: Union[str, None], total_pages: int) -> Set[int]: + """Parse page selection string and return set of page indices (0-based). 
+ + Args: + page_selection: Selection string or None + total_pages: Total number of pages available + + Returns: + Set of 0-based page indices to display + """ + # Handle None or "all" - return all pages + if page_selection is None or page_selection.lower() == "all": + return set(range(total_pages)) + + selected_pages = set() + + # Clean the input + page_selection = page_selection.strip() + + # Split by commas for multiple selections + parts = page_selection.split(',') + + for part in parts: + part = part.strip() + + # Check if it's a range (contains hyphen) + if '-' in part: + try: + # Split range and convert to integers + range_parts = part.split('-') + if len(range_parts) == 2: + start = int(range_parts[0].strip()) + end = int(range_parts[1].strip()) + + # Convert from 1-indexed to 0-indexed + start_idx = start - 1 + end_idx = end - 1 + + # Add all pages in range (inclusive) + for i in range(start_idx, end_idx + 1): + if 0 <= i < total_pages: + selected_pages.add(i) + except ValueError: + print(f"Warning: Invalid range '{part}' in page selection") + else: + # Single page number + try: + page_num = int(part.strip()) + # Convert from 1-indexed to 0-indexed + page_idx = page_num - 1 + if 0 <= page_idx < total_pages: + selected_pages.add(page_idx) + else: + print(f"Warning: Page {page_num} is out of range (1-{total_pages})") + except ValueError: + print(f"Warning: Invalid page number '{part}' in page selection") + + # If no valid pages were selected, default to all pages + if not selected_pages: + print(f"Warning: No valid pages in selection '{page_selection}'. 
Showing all pages.") + return set(range(total_pages)) + + return selected_pages + + def _get_element_color(self, element_type: str) -> str: + """Get color for element type.""" + return self.element_colors.get(element_type.lower(), self.element_colors['default']) + + def _get_image_dimensions(self, image_path: str) -> Optional[Tuple[int, int]]: + """Get dimensions of an image file.""" + try: + if os.path.exists(image_path): + with Image.open(image_path) as img: + return img.size # Returns (width, height) + return None + except Exception as e: + print(f"Error getting image dimensions for {image_path}: {e}") + return None + + def _load_image_as_base64(self, image_path: str) -> Optional[str]: + """Load image from file path and convert to base64.""" + try: + if os.path.exists(image_path): + with open(image_path, 'rb') as img_file: + img_data = img_file.read() + img_base64 = base64.b64encode(img_data).decode('utf-8') + ext = os.path.splitext(image_path)[1].lower() + if ext in ['.jpg', '.jpeg']: + return f"data:image/jpeg;base64,{img_base64}" + elif ext in ['.png']: + return f"data:image/png;base64,{img_base64}" + else: + return f"data:image/jpeg;base64,{img_base64}" + return None + except Exception as e: + print(f"Error loading image {image_path}: {e}") + return None + + def _render_element_content(self, element: Dict, for_tooltip: bool = False) -> str: + """Render element content with appropriate formatting for both tooltip and element list display. 
+ + Args: + element: The element dictionary containing content/description + for_tooltip: Whether this is for tooltip display (affects styling and truncation) + """ + element_type = element.get('type', 'unknown') + content = element.get('content', '') + description = element.get('description', '') + + display_content = "" + + if content: + if element_type == 'table': + # Render the HTML table with styling + table_html = content + + # Apply different styling based on context + if for_tooltip: + # Compact styling for tooltips with light theme + # Use full width available for tooltip tables + table_style = f'''style="width: 100%; border-collapse: collapse; margin: 5px 0; font-size: 10px;"''' + th_style = 'style="border: 1px solid #ddd; padding: 4px; background: #f8f9fa; color: #333; font-weight: bold; text-align: left; font-size: 10px;"' + td_style = 'style="border: 1px solid #ddd; padding: 4px; color: #333; font-size: 10px;"' + thead_style = 'style="background: #e9ecef;"' + else: + # Full styling for element list + table_style = '''style="width: 100%; border-collapse: collapse; margin: 10px 0; font-size: 13px;"''' + th_style = 'style="border: 1px solid #ddd; padding: 8px; background: #f5f5f5; font-weight: bold; text-align: left;"' + td_style = 'style="border: 1px solid #ddd; padding: 8px;"' + thead_style = 'style="background: #f0f0f0;"' + + # Apply styling transformations + if '' in table_html: + table_html = table_html.replace('
', f'
') + if '' in table_html: + table_html = table_html.replace('', f'') + + if for_tooltip: + display_content = table_html + else: + display_content = f"
{table_html}
" + else: + # Regular content handling + if for_tooltip and len(content) > 500: + # Truncate for tooltip display and escape HTML for safety + display_content = self._escape_for_html_attribute(content[:500] + "...") + else: + display_content = self._escape_for_html_attribute(content) if for_tooltip else content + elif description: + desc_content = description + if for_tooltip and len(desc_content) > 500: + desc_content = desc_content[:500] + "..." + + if for_tooltip: + display_content = self._escape_for_html_attribute(f"Description: {desc_content}") + else: + display_content = f"Description: {desc_content}" + else: + display_content = "No content available" if for_tooltip else "No content" + + return display_content + + def _escape_for_html_attribute(self, text: str) -> str: + """Escape text for safe use in HTML attributes.""" + return (text.replace('&', '&') + .replace('<', '<') + .replace('>', '>') + .replace('"', '"') + .replace("'", ''') + .replace('\n', '
')) + + def _calculate_tooltip_width(self, element: Dict, image_width: int) -> int: + """Calculate dynamic tooltip width based on table content.""" + element_type = element.get('type', 'unknown') + content = element.get('content', '') + + if element_type == 'table' and content: + # Count columns by looking for ', content, re.DOTALL | re.IGNORECASE) + if first_row_match: + first_row = first_row_match.group(1) + # Count th or td tags + th_count = len(re.findall(r']*>', first_row, re.IGNORECASE)) + td_count = len(re.findall(r']*>', first_row, re.IGNORECASE)) + column_count = max(th_count, td_count) + + if column_count > 0: + # Base width + additional width per column + base_width = 300 + width_per_column = 80 + calculated_width = base_width + (column_count * width_per_column) + + # Cap at 4/5th of image width + max_width = int(image_width * 0.8) + return min(calculated_width, max_width) + + # Default width for non-tables or when calculation fails + return 400 + + def _create_annotated_image(self, page: Dict, elements: List[Dict]) -> str: + """Create annotated image with SCALING to fit within 1024px width.""" + image_uri = page.get('image_uri', '') + page_id = page.get('id', 0) + + if not image_uri: + return "

No image URI found for this page

" + + # Load image + img_data_uri = self._load_image_as_base64(image_uri) + if not img_data_uri: + return f""" +
+ Could not load image: {image_uri}
+ Make sure the file exists and is accessible. +
+ """ + + # Get original image dimensions + original_dimensions = self._get_image_dimensions(image_uri) + if not original_dimensions: + # Fallback: display without explicit scaling + original_width, original_height = 1024, 768 # Default fallback + else: + original_width, original_height = original_dimensions + + # Calculate scaling factor to fit within 1024px width + max_display_width = 1024 + scale_factor = 1.0 + display_width = original_width + display_height = original_height + + if original_width > max_display_width: + scale_factor = max_display_width / original_width + display_width = max_display_width + display_height = int(original_height * scale_factor) + + # Filter elements for this page and collect their bounding boxes + page_elements = [] + + for elem in elements: + elem_bboxes = [] + for bbox in elem.get('bbox', []): + if bbox.get('page_id', 0) == page_id: + coord = bbox.get('coord', []) + if len(coord) >= 4: + elem_bboxes.append(bbox) + + if elem_bboxes: + page_elements.append({ + 'element': elem, + 'bboxes': elem_bboxes + }) + + if not page_elements: + return f"

No elements found for page {page_id}

" + + header_info = f""" +
+ Page {page_id + 1}: {len(page_elements)} elements
+ Original size: {original_width}×{original_height}px | + Display size: {display_width}×{display_height}px | + Scale factor: {scale_factor:.3f}
+
+ """ + + # Generate unique container ID for this page + container_id = f"page_container_{page_id}_{id(self)}" + + # Create bounding box overlays using SCALED coordinates with hover functionality + overlays = [] + + for idx, item in enumerate(page_elements): + element = item['element'] + element_id = element.get('id', 'N/A') + element_type = element.get('type', 'unknown') + color = self._get_element_color(element_type) + + # Use the shared content renderer for tooltip + tooltip_content = self._render_element_content(element, for_tooltip=True) + + # Calculate dynamic tooltip width + tooltip_width = self._calculate_tooltip_width(element, display_width) + + # Tables should render as HTML, other content should be escaped + + for bbox_idx, bbox in enumerate(item['bboxes']): + coord = bbox.get('coord', []) + if len(coord) >= 4: + x1, y1, x2, y2 = coord + + # Apply scaling to coordinates + scaled_x1 = x1 * scale_factor + scaled_y1 = y1 * scale_factor + scaled_x2 = x2 * scale_factor + scaled_y2 = y2 * scale_factor + + width = scaled_x2 - scaled_x1 + height = scaled_y2 - scaled_y1 + + # Skip invalid boxes + if width <= 0 or height <= 0: + continue + + # Position label above box when possible + label_top = -18 if scaled_y1 >= 18 else 2 + + # Unique ID for this bounding box + box_id = f"bbox_{page_id}_{idx}_{bbox_idx}" + + # Calculate tooltip position (prefer right side, but switch to left if needed) + tooltip_left = 10 + + overlay = f""" +
+
+ {element_type.upper()[:6]}#{element_id} +
+ +
+
+ {element_type.upper()} #{element_id} +
+
+ {tooltip_content} +
+
+
+ """ + overlays.append(overlay) + + # Pure CSS hover functionality (works in Databricks) + styles = f""" + + """ + + return f""" + {header_info} + {styles} +
+ Page {page_id + 1} + {''.join(overlays)} +
+ """ + + def _create_page_elements_list(self, page_id: int, elements: List[Dict]) -> str: + """Create a detailed list of elements for a specific page.""" + # Filter elements for this page + page_elements = [] + + for elem in elements: + elem_bboxes = [] + for bbox in elem.get('bbox', []): + if bbox.get('page_id', 0) == page_id: + elem_bboxes.append(bbox) + + if elem_bboxes: + page_elements.append(elem) + + if not page_elements: + return f"

No elements found for page {page_id + 1}

" + + html_parts = [] + + for element in page_elements: + element_id = element.get('id', 'N/A') + element_type = element.get('type', 'unknown') + color = self._get_element_color(element_type) + + # Get bounding box info for this page only + bbox_info = "No bbox" + bbox_list = element.get('bbox', []) + if bbox_list: + bbox_details = [] + for bbox in bbox_list: + if bbox.get('page_id', 0) == page_id: + coord = bbox.get('coord', []) + if len(coord) >= 4: + bbox_details.append(f"[{coord[0]:.0f}, {coord[1]:.0f}, {coord[2]:.0f}, {coord[3]:.0f}]") + bbox_info = "; ".join(bbox_details) if bbox_details else "Invalid bbox" + + # Use the shared content renderer for element list display + display_content = self._render_element_content(element, for_tooltip=False) + + element_html = f""" +
+
+

+ {element_type.upper().replace('_', ' ')} (ID: {element_id}) +

+ + {bbox_info} + +
+
+ {display_content} +
+
+ """ + html_parts.append(element_html) + + return f""" +
+

📋 Page {page_id + 1} Elements ({len(page_elements)} items)

+ {''.join(html_parts)} +
+ """ + + def _create_summary(self, document: Dict, metadata: Dict, selected_pages: Set[int], total_pages: int) -> str: + """Create a summary with page selection info.""" + elements = document.get('elements', []) + + # Count elements only on selected pages + selected_elements = [] + for elem in elements: + for bbox in elem.get('bbox', []): + if bbox.get('page_id', 0) in selected_pages: + selected_elements.append(elem) + break + + # Count by type (for selected pages) + type_counts = {} + for elem in selected_elements: + elem_type = elem.get('type', 'unknown') + type_counts[elem_type] = type_counts.get(elem_type, 0) + 1 + + type_list = ', '.join([f"{t}: {c}" for t, c in type_counts.items()]) + + # Create page selection info + if len(selected_pages) == total_pages: + page_info = f"All {total_pages} pages" + else: + # Convert to 1-indexed for display + page_nums = sorted([p + 1 for p in selected_pages]) + if len(page_nums) <= 10: + page_info = f"Pages {', '.join(map(str, page_nums))} ({len(selected_pages)} of {total_pages})" + else: + page_info = f"{len(selected_pages)} of {total_pages} pages selected" + + return f""" +
+

📄 Document Summary

+

Displaying: {page_info}

+

Elements on selected pages: {len(selected_elements)}

+

Element Types: {type_list if type_list else 'None'}

+

Document ID: {str(metadata.get('id', 'N/A'))[:12]}...

+
+ """ + + def render_document(self, parsed_result: Any, page_selection: Union[str, None] = None) -> None: + """Main render function with page selection support. + + Args: + parsed_result: The parsed document result + page_selection: Page selection string. Supported formats: + - "all" or None: Display all pages + - "3": Display only page 3 (1-indexed) + - "1-5": Display pages 1 through 5 (inclusive) + - "1,3,5": Display specific pages + - "1-3,7,10-12": Mixed format + """ + try: + # Convert to dict + if hasattr(parsed_result, 'toPython'): + parsed_dict = parsed_result.toPython() + elif hasattr(parsed_result, 'toJson'): + parsed_dict = json.loads(parsed_result.toJson()) + elif isinstance(parsed_result, dict): + parsed_dict = parsed_result + else: + display(HTML(f"

❌ Could not convert result. Type: {type(parsed_result)}

")) + return + + # Extract components + document = parsed_dict.get('document', {}) + pages = document.get('pages', []) + elements = document.get('elements', []) + metadata = parsed_dict.get('metadata', {}) + + if not elements: + display(HTML("

❌ No elements found in document

")) + return + + # Parse page selection + selected_pages = self._parse_page_selection(page_selection, len(pages)) + + # Display title + display(HTML("

🔍 AI Parse Document Results

")) + + # Display summary with page selection info + summary_html = self._create_summary(document, metadata, selected_pages, len(pages)) + display(HTML(summary_html)) + + # Display color legend + legend_items = [] + for elem_type, color in self.element_colors.items(): + if elem_type != 'default': + legend_items.append(f""" + + + {elem_type.replace('_', ' ').title()} + + """) + + display(HTML(f""" +
+ 🎨 Element Colors:
+ {''.join(legend_items)} +
+ """)) + + # Display annotated images with their corresponding elements (filtered by selection) + if pages: + display(HTML("

🖼️ Annotated Images & Elements

")) + + # Sort selected pages for display + sorted_selected = sorted(selected_pages) + + for page_idx in sorted_selected: + if page_idx < len(pages): + page = pages[page_idx] + + # Display the annotated image + annotated_html = self._create_annotated_image(page, elements) + display(HTML(f"
{annotated_html}
")) + + # Display elements for this page immediately after the image + page_id = page.get('id', page_idx) + page_elements_html = self._create_page_elements_list(page_id, elements) + display(HTML(page_elements_html)) + + except Exception as e: + display(HTML(f"

❌ Error: {str(e)}

")) + import traceback + display(HTML(f"
{traceback.format_exc()}
")) + + +# Simple usage functions +def render_ai_parse_output(parsed_result, page_selection=None): + """Simple function to render ai_parse_document output with page selection. + + Args: + parsed_result: The parsed document result + page_selection: Optional page selection string. Examples: + - None or "all": Display all pages + - "3": Display only page 3 + - "1-5": Display pages 1 through 5 + - "1,3,5": Display specific pages + - "1-3,7,10-12": Mixed format + """ + renderer = DocumentRenderer() + renderer.render_document(parsed_result, page_selection) + +# COMMAND ---------- + +# DBTITLE 1,Debug Visualization Results +for parsed_result in parsed_results: + render_ai_parse_output(parsed_result, page_selection) \ No newline at end of file diff --git a/knowledge_base/workflow_with_ai_parse_document/src/transformations/01_parse_documents.py b/knowledge_base/workflow_with_ai_parse_document/src/transformations/01_parse_documents.py new file mode 100644 index 00000000..ba1fc506 --- /dev/null +++ b/knowledge_base/workflow_with_ai_parse_document/src/transformations/01_parse_documents.py @@ -0,0 +1,79 @@ +# Databricks notebook source +# MAGIC %md +# MAGIC # Parse Documents using ai_parse_document +# MAGIC +# MAGIC This notebook uses Structured Streaming to incrementally parse PDFs and images using the ai_parse_document function. 
+ +# COMMAND ---------- + +# Get parameters +dbutils.widgets.text("catalog", "users", "Catalog name") +dbutils.widgets.text("schema", "jas_bali", "Schema name") +dbutils.widgets.text("source_volume_path", "/Volumes/users/jas_bali/pdfs_ie", "Source volume path") +dbutils.widgets.text("output_volume_path", "/Volumes/users/jas_bali/pdfs_for_bricks", "Output volume path") +dbutils.widgets.text("checkpoint_location", "/tmp/checkpoints/parse_documents", "Checkpoint location") +dbutils.widgets.text("table_name", "parsed_documents_raw", "Output table name") + +catalog = dbutils.widgets.get("catalog") +schema = dbutils.widgets.get("schema") +source_volume_path = dbutils.widgets.get("source_volume_path") +output_volume_path = dbutils.widgets.get("output_volume_path") +checkpoint_location = dbutils.widgets.get("checkpoint_location") +table_name = dbutils.widgets.get("table_name") + +# COMMAND ---------- + +# Set catalog and schema +spark.sql(f"USE CATALOG {catalog}") +spark.sql(f"USE SCHEMA {schema}") + +# COMMAND ---------- + +from pyspark.sql.functions import col, current_timestamp, expr +from pyspark.sql.types import StructType, StructField, StringType, BinaryType, TimestampType, LongType + +# Define schema for binary files (must match exact schema expected by binaryFile format) +binary_file_schema = StructType([ + StructField("path", StringType(), False), + StructField("modificationTime", TimestampType(), False), + StructField("length", LongType(), False), + StructField("content", BinaryType(), True) +]) + +# Read files using Structured Streaming +files_df = (spark.readStream + .format("binaryFile") + .schema(binary_file_schema) + .option("pathGlobFilter", "*.{pdf,jpg,jpeg,png}") + .load(source_volume_path) +) + +# Parse documents with ai_parse_document +parsed_df = (files_df + .repartition(8, expr("crc32(path) % 8")) + .withColumn("parsed", + expr(f""" + ai_parse_document( + content, + map( + 'version', '2.0', + 'imageOutputPath', '{output_volume_path}', + 
'descriptionElementTypes', '*' + ) + ) + """) + ) + .withColumn("parsed_at", current_timestamp()) + .select("path", "parsed", "parsed_at") +) + +# Write to Delta table with streaming +(parsed_df.writeStream + .format("delta") + .outputMode("append") + .option("checkpointLocation", checkpoint_location) + .option("delta.feature.variantType-preview", "supported") + .option("mergeSchema", "true") + .trigger(availableNow=True) + .toTable(table_name) +) diff --git a/knowledge_base/workflow_with_ai_parse_document/src/transformations/02_extract_text.py b/knowledge_base/workflow_with_ai_parse_document/src/transformations/02_extract_text.py new file mode 100644 index 00000000..5e2d46c3 --- /dev/null +++ b/knowledge_base/workflow_with_ai_parse_document/src/transformations/02_extract_text.py @@ -0,0 +1,72 @@ +# Databricks notebook source +# MAGIC %md +# MAGIC # Extract Text from Parsed Documents +# MAGIC +# MAGIC This notebook uses Structured Streaming to extract clean text from parsed documents. + +# COMMAND ---------- + +# Get parameters +dbutils.widgets.text("catalog", "users", "Catalog name") +dbutils.widgets.text("schema", "jas_bali", "Schema name") +dbutils.widgets.text("checkpoint_location", "/tmp/checkpoints/extract_text", "Checkpoint location") +dbutils.widgets.text("source_table_name", "parsed_documents_raw", "Source table name") +dbutils.widgets.text("table_name", "parsed_documents_text", "Output table name") + +catalog = dbutils.widgets.get("catalog") +schema = dbutils.widgets.get("schema") +checkpoint_location = dbutils.widgets.get("checkpoint_location") +source_table_name = dbutils.widgets.get("source_table_name") +table_name = dbutils.widgets.get("table_name") + +# COMMAND ---------- + +# Set catalog and schema +spark.sql(f"USE CATALOG {catalog}") +spark.sql(f"USE SCHEMA {schema}") + +# COMMAND ---------- + +from pyspark.sql.functions import col, concat_ws, expr, lit, when + +# Read from source table using Structured Streaming +parsed_stream = (spark.readStream 
+ .format("delta") + .table(source_table_name) +) + +# Extract text from parsed documents +text_df = parsed_stream.withColumn( + "text", + when( + expr("try_cast(parsed:error_status AS STRING)").isNotNull(), + lit(None) + ).otherwise( + concat_ws( + "\n\n", + expr(""" + transform( + CASE + WHEN try_cast(parsed:metadata:version AS STRING) = '1.0' + THEN try_cast(parsed:document:pages AS ARRAY<VARIANT>) + ELSE try_cast(parsed:document:elements AS ARRAY<VARIANT>) + END, + element -> try_cast(element:content AS STRING) + ) + """) + ) + ) +).withColumn( + "error_status", + expr("try_cast(parsed:error_status AS STRING)") +).select("path", "text", "error_status", "parsed_at") + +# Write to Delta table with streaming +(text_df.writeStream + .format("delta") + .outputMode("append") + .option("checkpointLocation", checkpoint_location) + .option("mergeSchema", "true") + .trigger(availableNow=True) + .toTable(table_name) +) diff --git a/knowledge_base/workflow_with_ai_parse_document/src/transformations/03_extract_structured_data.py b/knowledge_base/workflow_with_ai_parse_document/src/transformations/03_extract_structured_data.py new file mode 100644 index 00000000..f5088bbb --- /dev/null +++ b/knowledge_base/workflow_with_ai_parse_document/src/transformations/03_extract_structured_data.py @@ -0,0 +1,75 @@ +# Databricks notebook source +# MAGIC %md +# MAGIC # Extract Structured Data using AI Query +# MAGIC +# MAGIC This notebook uses Structured Streaming to extract structured JSON from document text using ai_query. 
+ +# COMMAND ---------- + +# Get parameters +dbutils.widgets.text("catalog", "users", "Catalog name") +dbutils.widgets.text("schema", "jas_bali", "Schema name") +dbutils.widgets.text("checkpoint_location", "/tmp/checkpoints/extract_structured", "Checkpoint location") +dbutils.widgets.text("source_table_name", "parsed_documents_text", "Source table name") +dbutils.widgets.text("table_name", "parsed_documents_structured", "Output table name") + +catalog = dbutils.widgets.get("catalog") +schema = dbutils.widgets.get("schema") +checkpoint_location = dbutils.widgets.get("checkpoint_location") +source_table_name = dbutils.widgets.get("source_table_name") +table_name = dbutils.widgets.get("table_name") + +# COMMAND ---------- + +# Set catalog and schema +spark.sql(f"USE CATALOG {catalog}") +spark.sql(f"USE SCHEMA {schema}") + +# COMMAND ---------- + +from pyspark.sql.functions import col, concat, current_timestamp, expr, length, lit + +# Read from source table using Structured Streaming +text_stream = (spark.readStream + .format("delta") + .table(source_table_name) + .filter( + (col("text").isNotNull()) & + (col("error_status").isNull()) & + (length(col("text")) > 100) + ) +) + +# Extract structured data using ai_query +structured_df = text_stream.withColumn( + "extracted_json", + expr(""" + ai_query( + 'databricks-claude-sonnet-4', + concat( + 'Extract key information from this document and return as JSON. ', + 'Include: document_type, key_entities (names, organizations, locations), ', + 'dates, amounts, and a brief summary (max 100 words). 
', + 'Document text: ', + text + ), + returnType => 'STRING', + modelParameters => named_struct( + 'max_tokens', 2000, + 'temperature', 0.1 + ) + ) + """) +).withColumn( + "extraction_timestamp", current_timestamp() +).select("path", "extracted_json", "parsed_at", "extraction_timestamp") + +# Write to Delta table with streaming +(structured_df.writeStream + .format("delta") + .outputMode("append") + .option("checkpointLocation", checkpoint_location) + .option("mergeSchema", "true") + .trigger(availableNow=True) + .toTable(table_name) +) From 63ec6949c116f2d4b965ca0ec39d5660134ab871 Mon Sep 17 00:00:00 2001 From: "jas.bali" Date: Mon, 13 Oct 2025 19:18:37 -0400 Subject: [PATCH 2/6] Address PR feedback for workflow example - Add job-level parameters block for catalog and schema (shared across all tasks) - Move optional schedule configuration to top of job definition - Replace Python notebook with Jupyter notebook format including visual outputs --- .../ai_parse_document_workflow.job.yml | 23 +- .../ai_parse_document -- debug output.ipynb | 7486 +++++++++++++++++ .../ai_parse_document -- debug output.py | 782 -- 3 files changed, 7498 insertions(+), 793 deletions(-) create mode 100644 knowledge_base/workflow_with_ai_parse_document/src/explorations/ai_parse_document -- debug output.ipynb delete mode 100644 knowledge_base/workflow_with_ai_parse_document/src/explorations/ai_parse_document -- debug output.py diff --git a/knowledge_base/workflow_with_ai_parse_document/resources/ai_parse_document_workflow.job.yml b/knowledge_base/workflow_with_ai_parse_document/resources/ai_parse_document_workflow.job.yml index 1a425bf2..83997b15 100644 --- a/knowledge_base/workflow_with_ai_parse_document/resources/ai_parse_document_workflow.job.yml +++ b/knowledge_base/workflow_with_ai_parse_document/resources/ai_parse_document_workflow.job.yml @@ -3,6 +3,18 @@ resources: ai_parse_document_workflow: name: ai_parse_document_workflow + # Optional: Add a schedule + # schedule: + # 
quartz_cron_expression: "0 0 * * * ?" + # timezone_id: "UTC" + + # Job-level parameters shared across all tasks + parameters: + - name: catalog + default: ${var.catalog} + - name: schema + default: ${var.schema} + environments: - environment_key: serverless_env spec: @@ -14,8 +26,6 @@ resources: notebook_task: notebook_path: ../src/transformations/01_parse_documents.py base_parameters: - catalog: ${var.catalog} - schema: ${var.schema} source_volume_path: ${var.source_volume_path} output_volume_path: ${var.output_volume_path} checkpoint_location: ${var.checkpoint_base_path}/01_parse_documents @@ -28,8 +38,6 @@ resources: notebook_task: notebook_path: ../src/transformations/02_extract_text.py base_parameters: - catalog: ${var.catalog} - schema: ${var.schema} checkpoint_location: ${var.checkpoint_base_path}/02_extract_text source_table_name: ${var.raw_table_name} table_name: ${var.text_table_name} @@ -41,15 +49,8 @@ resources: notebook_task: notebook_path: ../src/transformations/03_extract_structured_data.py base_parameters: - catalog: ${var.catalog} - schema: ${var.schema} checkpoint_location: ${var.checkpoint_base_path}/03_extract_structured_data source_table_name: ${var.text_table_name} table_name: ${var.structured_table_name} max_concurrent_runs: 1 - - # Optional: Add a schedule - # schedule: - # quartz_cron_expression: "0 0 * * * ?" 
- # timezone_id: "UTC" diff --git a/knowledge_base/workflow_with_ai_parse_document/src/explorations/ai_parse_document -- debug output.ipynb b/knowledge_base/workflow_with_ai_parse_document/src/explorations/ai_parse_document -- debug output.ipynb new file mode 100644 index 00000000..be7155ee --- /dev/null +++ b/knowledge_base/workflow_with_ai_parse_document/src/explorations/ai_parse_document -- debug output.ipynb @@ -0,0 +1,7486 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "b1fe924b-46e4-4cd2-ac59-712283f9af38", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "# \uD83D\uDD0D AI Parse Document Debug Interface\n", + "\n", + "Version 1.3\n", + "\n", + "Last update: Oct 6, 2025\n", + "\n", + "Changelog:\n", + "- Simplified widget parameters: `input_file` and `image_output_path` now accept full volume paths\n", + "- Removed separate `catalog`, `schema`, `volume` widgets\n", + "- `input_file` supports wildcards for processing multiple files (e.g., `/Volumes/catalog/schema/volume/input/*`)\n", + "\n", + "## Overview\n", + "This notebook provides a **visual debugging interface** for analyzing the output of Databricks' `ai_parse_document` function. 
It renders parsed documents with interactive bounding box overlays, allowing you to inspect what content was extracted from each region of your documents.\n", + "\n", + "## Features\n", + "- \uD83D\uDCCA **Visual Bounding Boxes**: Color-coded overlays showing the exact regions where text/elements were detected\n", + "- \uD83C\uDFAF **Interactive Tooltips**: Hover over any bounding box to see the parsed content from that region\n", + "- \uD83D\uDCD0 **Automatic Scaling**: Large documents are automatically scaled to fit within 1024px width for optimal viewing\n", + "- \uD83C\uDFA8 **Element Type Visualization**: Different colors for different element types (text, headers, tables, figures, etc.)\n", + "\n", + "## Required Parameters\n", + "\n", + "This interface requires widget parameters to be configured before running:\n", + "\n", + "### 1. `input_file`\n", + "- **Description**: Full Unity Catalog volume path to the document(s) you want to parse and visualize\n", + "- **Examples**:\n", + " - Single file: `/Volumes/catalog/schema/volume/input/document.pdf`\n", + " - All files in directory: `/Volumes/catalog/schema/volume/input/*`\n", + " - Pattern matching: `/Volumes/catalog/schema/volume/input/*.pdf`\n", + "- **Requirements**: Read access to the volume containing your PDF/image files\n", + "\n", + "### 2. `image_output_path`\n", + "- **Description**: Full Unity Catalog volume path where `ai_parse_document` will store the extracted page images\n", + "- **Example**: `/Volumes/catalog/schema/volume/output/`\n", + "- **Requirements**: Write access required for storing intermediate image outputs\n", + "- **Note**: As documented in the [official Databricks documentation](https://docs.databricks.com/aws/en/sql/language-manual/functions/ai_parse_document), this path is used by the parsing function to store page images that are referenced in the output\n", + "\n", + "### 3. 
`page_selection`\n", + "- **Description**: Specifies which pages to display in the visualization\n", + "- **Supported formats**:\n", + " - `\"all\"` or leave empty: Display all pages\n", + " - `\"3\"`: Display only page 3 (1-indexed)\n", + " - `\"1-5\"`: Display pages 1 through 5 (inclusive, 1-indexed)\n", + " - `\"1,3,5\"`: Display specific pages (1-indexed)\n", + " - `\"1-3,7,10-12\"`: Mixed ranges and individual pages\n", + "\n", + "## Usage Instructions\n", + "\n", + "1. **Clone this notebook** to your workspace:\n", + " - Select **\"File -> Clone\"** button in the top toolbar\n", + " - Choose your desired location in your workspace\n", + " - This ensures you have a personal copy you can modify and run\n", + "\n", + "2. **Prepare your Unity Catalog volumes**:\n", + " - Create or identify a volume for your PDF/image files\n", + " - Create or identify a volume for output images\n", + " - Upload your PDF files to the input location\n", + "\n", + "3. **Configure the widget parameters** at the top of this notebook:\n", + " - Set `input_file` to the full volume path (file or directory with wildcard)\n", + " - Set `image_output_path` to the full volume path for outputs\n", + " - Set `page_selection` to control which pages to visualize\n", + "\n", + "4. 
**Run all code cells** which will generate visual debugging results.\n", + "\n", + "## What You'll See\n", + "\n", + "- **Document Summary**: Overview of pages, element counts, and document metadata\n", + "- **Color Legend**: Visual guide showing which colors represent which element types\n", + "- **Annotated Images**: Each page with overlaid bounding boxes\n", + " - Hover over any box to see the extracted content\n", + " - Yellow highlight indicates the currently hovered element\n", + "- **Parsed Elements List**: Complete list of all extracted elements with their content" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "2763f01c-f6f2-47b4-9a0a-bfeccfd131ca", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Exec Parameters\n", + "\n", + "dbutils.widgets.text(\"input_file\", \"/Volumes/main/default/source_documents/sample.pdf\")\n", + "dbutils.widgets.text(\"image_output_path\", \"/Volumes/main/default/parsed_output/\")\n", + "dbutils.widgets.text(\"page_selection\", \"all\")\n", + "\n", + "input_file = dbutils.widgets.get(\"input_file\")\n", + "image_output_path = dbutils.widgets.get(\"image_output_path\")\n", + "page_selection = dbutils.widgets.get(\"page_selection\")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "cbdcca83-af8c-427b-b82a-33d5cd73b998", + "showTitle": true, + "tableResultSettingsMap": {}, + "title": "Configuration Parameters" + } + }, + "outputs": [], + "source": [ + "# Path configuration - use widget values as-is\n", + "\n", + "source_files = input_file\n", + "\n", + "# Parse page selection string and return list of page indices to display.\n", + 
"#\n", + "# Supported formats:\n", + "# - \"all\" or None: Display all pages\n", + "# - \"3\": Display specific page (1-indexed)\n", + "# - \"1-5\": Display page range (inclusive, 1-indexed)\n", + "# - \"1,3,5\": Display list of specific pages (1-indexed)\n", + "# - \"1-3,7,10-12\": Mixed ranges and individual pages\n", + "page_selection = f\"{page_selection}\"" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "8eb28801-7c66-4d5f-9ee6-66d8e10bd45e", + "showTitle": true, + "tableResultSettingsMap": {}, + "title": "Run Document Parse Code (may take some time)" + } + }, + "outputs": [], + "source": [ + "# SQL statement with ai_parse_document()\n", + "# Note: input_file can be a single file path or a directory path with wildcard\n", + "sql = f'''\n", + "with parsed_documents AS (\n", + " SELECT\n", + " path,\n", + " ai_parse_document(content\n", + " ,\n", + " map(\n", + " 'version', '2.0',\n", + " 'imageOutputPath', '{image_output_path}',\n", + " 'descriptionElementTypes', '*'\n", + " )\n", + " ) as parsed\n", + " FROM\n", + " read_files('{source_files}', format => 'binaryFile')\n", + ")\n", + "select * from parsed_documents\n", + "'''\n", + "\n", + "parsed_results = [row.parsed for row in spark.sql(sql).collect()]" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "3770f490-a617-46f9-9904-e85249dc0f33", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "import json\n", + "from typing import Dict, List, Any, Optional, Tuple, Set, Union\n", + "from IPython.display import HTML, display\n", + "import base64\n", + "import os\n", + "from PIL import Image\n", + "import io\n", + "\n", 
+ "class DocumentRenderer:\n", + " def __init__(self):\n", + " # Color mapping for different element types\n", + " self.element_colors = {\n", + " 'section_header': '#FF6B6B',\n", + " 'text': '#4ECDC4', \n", + " 'figure': '#45B7D1',\n", + " 'caption': '#96CEB4',\n", + " 'page_footer': '#FFEAA7',\n", + " 'page_header': '#DDA0DD',\n", + " 'table': '#98D8C8',\n", + " 'list': '#F7DC6F',\n", + " 'default': '#BDC3C7'\n", + " }\n", + " \n", + " def _parse_page_selection(self, page_selection: Union[str, None], total_pages: int) -> Set[int]:\n", + " \"\"\"Parse page selection string and return set of page indices (0-based).\n", + " \n", + " Args:\n", + " page_selection: Selection string or None\n", + " total_pages: Total number of pages available\n", + " \n", + " Returns:\n", + " Set of 0-based page indices to display\n", + " \"\"\"\n", + " # Handle None or \"all\" - return all pages\n", + " if page_selection is None or page_selection.lower() == \"all\":\n", + " return set(range(total_pages))\n", + " \n", + " selected_pages = set()\n", + " \n", + " # Clean the input\n", + " page_selection = page_selection.strip()\n", + " \n", + " # Split by commas for multiple selections\n", + " parts = page_selection.split(',')\n", + " \n", + " for part in parts:\n", + " part = part.strip()\n", + " \n", + " # Check if it's a range (contains hyphen)\n", + " if '-' in part:\n", + " try:\n", + " # Split range and convert to integers\n", + " range_parts = part.split('-')\n", + " if len(range_parts) == 2:\n", + " start = int(range_parts[0].strip())\n", + " end = int(range_parts[1].strip())\n", + " \n", + " # Convert from 1-indexed to 0-indexed\n", + " start_idx = start - 1\n", + " end_idx = end - 1\n", + " \n", + " # Add all pages in range (inclusive)\n", + " for i in range(start_idx, end_idx + 1):\n", + " if 0 <= i < total_pages:\n", + " selected_pages.add(i)\n", + " except ValueError:\n", + " print(f\"Warning: Invalid range '{part}' in page selection\")\n", + " else:\n", + " # Single page 
number\n", + " try:\n", + " page_num = int(part.strip())\n", + " # Convert from 1-indexed to 0-indexed\n", + " page_idx = page_num - 1\n", + " if 0 <= page_idx < total_pages:\n", + " selected_pages.add(page_idx)\n", + " else:\n", + " print(f\"Warning: Page {page_num} is out of range (1-{total_pages})\")\n", + " except ValueError:\n", + " print(f\"Warning: Invalid page number '{part}' in page selection\")\n", + " \n", + " # If no valid pages were selected, default to all pages\n", + " if not selected_pages:\n", + " print(f\"Warning: No valid pages in selection '{page_selection}'. Showing all pages.\")\n", + " return set(range(total_pages))\n", + " \n", + " return selected_pages\n", + " \n", + " def _get_element_color(self, element_type: str) -> str:\n", + " \"\"\"Get color for element type.\"\"\"\n", + " return self.element_colors.get(element_type.lower(), self.element_colors['default'])\n", + " \n", + " def _get_image_dimensions(self, image_path: str) -> Optional[Tuple[int, int]]:\n", + " \"\"\"Get dimensions of an image file.\"\"\"\n", + " try:\n", + " if os.path.exists(image_path):\n", + " with Image.open(image_path) as img:\n", + " return img.size # Returns (width, height)\n", + " return None\n", + " except Exception as e:\n", + " print(f\"Error getting image dimensions for {image_path}: {e}\")\n", + " return None\n", + " \n", + " def _load_image_as_base64(self, image_path: str) -> Optional[str]:\n", + " \"\"\"Load image from file path and convert to base64.\"\"\"\n", + " try:\n", + " if os.path.exists(image_path):\n", + " with open(image_path, 'rb') as img_file:\n", + " img_data = img_file.read()\n", + " img_base64 = base64.b64encode(img_data).decode('utf-8')\n", + " ext = os.path.splitext(image_path)[1].lower()\n", + " if ext in ['.jpg', '.jpeg']:\n", + " return f\"data:image/jpeg;base64,{img_base64}\"\n", + " elif ext in ['.png']:\n", + " return f\"data:image/png;base64,{img_base64}\"\n", + " else:\n", + " return f\"data:image/jpeg;base64,{img_base64}\"\n", + 
" return None\n", + " except Exception as e:\n", + " print(f\"Error loading image {image_path}: {e}\")\n", + " return None\n", + " \n", + " def _render_element_content(self, element: Dict, for_tooltip: bool = False) -> str:\n", + " \"\"\"Render element content with appropriate formatting for both tooltip and element list display.\n", + " \n", + " Args:\n", + " element: The element dictionary containing content/description\n", + " for_tooltip: Whether this is for tooltip display (affects styling and truncation)\n", + " \"\"\"\n", + " element_type = element.get('type', 'unknown')\n", + " content = element.get('content', '')\n", + " description = element.get('description', '')\n", + " \n", + " display_content = \"\"\n", + " \n", + " if content:\n", + " if element_type == 'table':\n", + " # Render the HTML table with styling\n", + " table_html = content\n", + " \n", + " # Apply different styling based on context\n", + " if for_tooltip:\n", + " # Compact styling for tooltips with light theme\n", + " # Use full width available for tooltip tables\n", + " table_style = f'''style=\"width: 100%; border-collapse: collapse; margin: 5px 0; font-size: 10px;\"'''\n", + " th_style = 'style=\"border: 1px solid #ddd; padding: 4px; background: #f8f9fa; color: #333; font-weight: bold; text-align: left; font-size: 10px;\"'\n", + " td_style = 'style=\"border: 1px solid #ddd; padding: 4px; color: #333; font-size: 10px;\"'\n", + " thead_style = 'style=\"background: #e9ecef;\"'\n", + " else:\n", + " # Full styling for element list\n", + " table_style = '''style=\"width: 100%; border-collapse: collapse; margin: 10px 0; font-size: 13px;\"'''\n", + " th_style = 'style=\"border: 1px solid #ddd; padding: 8px; background: #f5f5f5; font-weight: bold; text-align: left;\"'\n", + " td_style = 'style=\"border: 1px solid #ddd; padding: 8px;\"'\n", + " thead_style = 'style=\"background: #f0f0f0;\"'\n", + " \n", + " # Apply styling transformations\n", + " if '
' in table_html: + table_html = table_html.replace('', f'') + if '' in table_html: + table_html = table_html.replace('', f'') + if '
or tags in first row + import re + + # Find first row (either in thead or tbody) + first_row_match = re.search(r']*>(.*?)
' in table_html:\n", + " table_html = table_html.replace('
', f'
')\n", + " if '' in table_html:\n", + " table_html = table_html.replace('', f'')\n", + " \n", + " if for_tooltip:\n", + " display_content = table_html\n", + " else:\n", + " display_content = f\"
{table_html}
\"\n", + " else:\n", + " # Regular content handling\n", + " if for_tooltip and len(content) > 500:\n", + " # Truncate for tooltip display and escape HTML for safety\n", + " display_content = self._escape_for_html_attribute(content[:500] + \"...\")\n", + " else:\n", + " display_content = self._escape_for_html_attribute(content) if for_tooltip else content\n", + " elif description:\n", + " desc_content = description\n", + " if for_tooltip and len(desc_content) > 500:\n", + " desc_content = desc_content[:500] + \"...\"\n", + " \n", + " if for_tooltip:\n", + " display_content = self._escape_for_html_attribute(f\"Description: {desc_content}\")\n", + " else:\n", + " display_content = f\"Description: {desc_content}\"\n", + " else:\n", + " display_content = \"No content available\" if for_tooltip else \"No content\"\n", + " \n", + " return display_content\n", + " \n", + " def _escape_for_html_attribute(self, text: str) -> str:\n", + " \"\"\"Escape text for safe use in HTML attributes.\"\"\"\n", + " return (text.replace('&', '&')\n", + " .replace('<', '<')\n", + " .replace('>', '>')\n", + " .replace('\"', '"')\n", + " .replace(\"'\", ''')\n", + " .replace('\\n', '
'))\n", + " \n", + " def _calculate_tooltip_width(self, element: Dict, image_width: int) -> int:\n", + " \"\"\"Calculate dynamic tooltip width based on table content.\"\"\"\n", + " element_type = element.get('type', 'unknown')\n", + " content = element.get('content', '')\n", + " \n", + " if element_type == 'table' and content:\n", + " # Count columns by looking for ', content, re.DOTALL | re.IGNORECASE)\n", + " if first_row_match:\n", + " first_row = first_row_match.group(1)\n", + " # Count th or td tags\n", + " th_count = len(re.findall(r']*>', first_row, re.IGNORECASE))\n", + " td_count = len(re.findall(r']*>', first_row, re.IGNORECASE))\n", + " column_count = max(th_count, td_count)\n", + " \n", + " if column_count > 0:\n", + " # Base width + additional width per column\n", + " base_width = 300\n", + " width_per_column = 80\n", + " calculated_width = base_width + (column_count * width_per_column)\n", + " \n", + " # Cap at 4/5th of image width\n", + " max_width = int(image_width * 0.8)\n", + " return min(calculated_width, max_width)\n", + " \n", + " # Default width for non-tables or when calculation fails\n", + " return 400\n", + " \n", + " def _create_annotated_image(self, page: Dict, elements: List[Dict]) -> str:\n", + " \"\"\"Create annotated image with SCALING to fit within 1024px width.\"\"\"\n", + " image_uri = page.get('image_uri', '')\n", + " page_id = page.get('id', 0)\n", + " \n", + " if not image_uri:\n", + " return \"

No image URI found for this page

\"\n", + " \n", + " # Load image\n", + " img_data_uri = self._load_image_as_base64(image_uri)\n", + " if not img_data_uri:\n", + " return f\"\"\"\n", + "
\n", + " Could not load image: {image_uri}
\n", + " Make sure the file exists and is accessible.\n", + "
\n", + " \"\"\"\n", + " \n", + " # Get original image dimensions\n", + " original_dimensions = self._get_image_dimensions(image_uri)\n", + " if not original_dimensions:\n", + " # Fallback: display without explicit scaling\n", + " original_width, original_height = 1024, 768 # Default fallback\n", + " else:\n", + " original_width, original_height = original_dimensions\n", + " \n", + " # Calculate scaling factor to fit within 1024px width\n", + " max_display_width = 1024\n", + " scale_factor = 1.0\n", + " display_width = original_width\n", + " display_height = original_height\n", + " \n", + " if original_width > max_display_width:\n", + " scale_factor = max_display_width / original_width\n", + " display_width = max_display_width\n", + " display_height = int(original_height * scale_factor)\n", + " \n", + " # Filter elements for this page and collect their bounding boxes\n", + " page_elements = []\n", + " \n", + " for elem in elements:\n", + " elem_bboxes = []\n", + " for bbox in elem.get('bbox', []):\n", + " if bbox.get('page_id', 0) == page_id:\n", + " coord = bbox.get('coord', [])\n", + " if len(coord) >= 4:\n", + " elem_bboxes.append(bbox)\n", + " \n", + " if elem_bboxes:\n", + " page_elements.append({\n", + " 'element': elem,\n", + " 'bboxes': elem_bboxes\n", + " })\n", + " \n", + " if not page_elements:\n", + " return f\"

No elements found for page {page_id}

\"\n", + " \n", + " header_info = f\"\"\"\n", + "
\n", + " Page {page_id + 1}: {len(page_elements)} elements
\n", + " Original size: {original_width}×{original_height}px | \n", + " Display size: {display_width}×{display_height}px | \n", + " Scale factor: {scale_factor:.3f}
\n", + "
\n", + " \"\"\"\n", + " \n", + " # Generate unique container ID for this page\n", + " container_id = f\"page_container_{page_id}_{id(self)}\"\n", + " \n", + " # Create bounding box overlays using SCALED coordinates with hover functionality\n", + " overlays = []\n", + " \n", + " for idx, item in enumerate(page_elements):\n", + " element = item['element']\n", + " element_id = element.get('id', 'N/A')\n", + " element_type = element.get('type', 'unknown')\n", + " color = self._get_element_color(element_type)\n", + " \n", + " # Use the shared content renderer for tooltip\n", + " tooltip_content = self._render_element_content(element, for_tooltip=True)\n", + " \n", + " # Calculate dynamic tooltip width\n", + " tooltip_width = self._calculate_tooltip_width(element, display_width)\n", + " \n", + " # Tables should render as HTML, other content should be escaped\n", + " \n", + " for bbox_idx, bbox in enumerate(item['bboxes']):\n", + " coord = bbox.get('coord', [])\n", + " if len(coord) >= 4:\n", + " x1, y1, x2, y2 = coord\n", + " \n", + " # Apply scaling to coordinates\n", + " scaled_x1 = x1 * scale_factor\n", + " scaled_y1 = y1 * scale_factor\n", + " scaled_x2 = x2 * scale_factor\n", + " scaled_y2 = y2 * scale_factor\n", + " \n", + " width = scaled_x2 - scaled_x1\n", + " height = scaled_y2 - scaled_y1\n", + " \n", + " # Skip invalid boxes\n", + " if width <= 0 or height <= 0:\n", + " continue\n", + " \n", + " # Position label above box when possible\n", + " label_top = -18 if scaled_y1 >= 18 else 2\n", + " \n", + " # Unique ID for this bounding box\n", + " box_id = f\"bbox_{page_id}_{idx}_{bbox_idx}\"\n", + " \n", + " # Calculate tooltip position (prefer right side, but switch to left if needed)\n", + " tooltip_left = 10\n", + " \n", + " overlay = f\"\"\"\n", + "
\n", + "
\n", + " {element_type.upper()[:6]}#{element_id}\n", + "
\n", + " \n", + "
\n", + "
\n", + " {element_type.upper()} #{element_id}\n", + "
\n", + "
\n", + " {tooltip_content}\n", + "
\n", + "
\n", + "
\n", + " \"\"\"\n", + " overlays.append(overlay)\n", + " \n", + " # Pure CSS hover functionality (works in Databricks)\n", + " styles = f\"\"\"\n", + " \n", + " \"\"\"\n", + " \n", + " return f\"\"\"\n", + " {header_info}\n", + " {styles}\n", + "
\n", + " \"Page\n", + " {''.join(overlays)}\n", + "
\n", + " \"\"\"\n", + " \n", + " def _create_page_elements_list(self, page_id: int, elements: List[Dict]) -> str:\n", + " \"\"\"Create a detailed list of elements for a specific page.\"\"\"\n", + " # Filter elements for this page\n", + " page_elements = []\n", + " \n", + " for elem in elements:\n", + " elem_bboxes = []\n", + " for bbox in elem.get('bbox', []):\n", + " if bbox.get('page_id', 0) == page_id:\n", + " elem_bboxes.append(bbox)\n", + " \n", + " if elem_bboxes:\n", + " page_elements.append(elem)\n", + " \n", + " if not page_elements:\n", + " return f\"

No elements found for page {page_id + 1}

\"\n", + " \n", + " html_parts = []\n", + " \n", + " for element in page_elements:\n", + " element_id = element.get('id', 'N/A')\n", + " element_type = element.get('type', 'unknown')\n", + " color = self._get_element_color(element_type)\n", + " \n", + " # Get bounding box info for this page only\n", + " bbox_info = \"No bbox\"\n", + " bbox_list = element.get('bbox', [])\n", + " if bbox_list:\n", + " bbox_details = []\n", + " for bbox in bbox_list:\n", + " if bbox.get('page_id', 0) == page_id:\n", + " coord = bbox.get('coord', [])\n", + " if len(coord) >= 4:\n", + " bbox_details.append(f\"[{coord[0]:.0f}, {coord[1]:.0f}, {coord[2]:.0f}, {coord[3]:.0f}]\")\n", + " bbox_info = \"; \".join(bbox_details) if bbox_details else \"Invalid bbox\"\n", + " \n", + " # Use the shared content renderer for element list display\n", + " display_content = self._render_element_content(element, for_tooltip=False)\n", + " \n", + " element_html = f\"\"\"\n", + "
\n", + "
\n", + "

\n", + " {element_type.upper().replace('_', ' ')} (ID: {element_id})\n", + "

\n", + " \n", + " {bbox_info}\n", + " \n", + "
\n", + "
\n", + " {display_content}\n", + "
\n", + "
\n", + " \"\"\"\n", + " html_parts.append(element_html)\n", + " \n", + " return f\"\"\"\n", + "
\n", + "

\uD83D\uDCCB Page {page_id + 1} Elements ({len(page_elements)} items)

\n", + " {''.join(html_parts)}\n", + "
\n", + " \"\"\"\n", + " \n", + " def _create_summary(self, document: Dict, metadata: Dict, selected_pages: Set[int], total_pages: int) -> str:\n", + " \"\"\"Create a summary with page selection info.\"\"\"\n", + " elements = document.get('elements', [])\n", + " \n", + " # Count elements only on selected pages\n", + " selected_elements = []\n", + " for elem in elements:\n", + " for bbox in elem.get('bbox', []):\n", + " if bbox.get('page_id', 0) in selected_pages:\n", + " selected_elements.append(elem)\n", + " break\n", + " \n", + " # Count by type (for selected pages)\n", + " type_counts = {}\n", + " for elem in selected_elements:\n", + " elem_type = elem.get('type', 'unknown')\n", + " type_counts[elem_type] = type_counts.get(elem_type, 0) + 1\n", + " \n", + " type_list = ', '.join([f\"{t}: {c}\" for t, c in type_counts.items()])\n", + " \n", + " # Create page selection info\n", + " if len(selected_pages) == total_pages:\n", + " page_info = f\"All {total_pages} pages\"\n", + " else:\n", + " # Convert to 1-indexed for display\n", + " page_nums = sorted([p + 1 for p in selected_pages])\n", + " if len(page_nums) <= 10:\n", + " page_info = f\"Pages {', '.join(map(str, page_nums))} ({len(selected_pages)} of {total_pages})\"\n", + " else:\n", + " page_info = f\"{len(selected_pages)} of {total_pages} pages selected\"\n", + " \n", + " return f\"\"\"\n", + "
\n", + "

\uD83D\uDCC4 Document Summary

\n", + "

Displaying: {page_info}

\n", + "

Elements on selected pages: {len(selected_elements)}

\n", + "

Element Types: {type_list if type_list else 'None'}

\n", + "

Document ID: {str(metadata.get('id', 'N/A'))[:12]}...

\n", + "
\n", + " \"\"\"\n", + " \n", + " def render_document(self, parsed_result: Any, page_selection: Union[str, None] = None) -> None:\n", + " \"\"\"Main render function with page selection support.\n", + " \n", + " Args:\n", + " parsed_result: The parsed document result\n", + " page_selection: Page selection string. Supported formats:\n", + " - \"all\" or None: Display all pages\n", + " - \"3\": Display only page 3 (1-indexed)\n", + " - \"1-5\": Display pages 1 through 5 (inclusive)\n", + " - \"1,3,5\": Display specific pages\n", + " - \"1-3,7,10-12\": Mixed format\n", + " \"\"\"\n", + " try:\n", + " # Convert to dict\n", + " if hasattr(parsed_result, 'toPython'):\n", + " parsed_dict = parsed_result.toPython()\n", + " elif hasattr(parsed_result, 'toJson'):\n", + " parsed_dict = json.loads(parsed_result.toJson())\n", + " elif isinstance(parsed_result, dict):\n", + " parsed_dict = parsed_result\n", + " else:\n", + " display(HTML(f\"

❌ Could not convert result. Type: {type(parsed_result)}

\"))\n", + " return\n", + " \n", + " # Extract components\n", + " document = parsed_dict.get('document', {})\n", + " pages = document.get('pages', [])\n", + " elements = document.get('elements', [])\n", + " metadata = parsed_dict.get('metadata', {})\n", + " \n", + " if not elements:\n", + " display(HTML(\"

❌ No elements found in document

\"))\n", + " return\n", + " \n", + " # Parse page selection\n", + " selected_pages = self._parse_page_selection(page_selection, len(pages))\n", + " \n", + " # Display title\n", + " display(HTML(\"

\uD83D\uDD0D AI Parse Document Results

\"))\n", + " \n", + " # Display summary with page selection info\n", + " summary_html = self._create_summary(document, metadata, selected_pages, len(pages))\n", + " display(HTML(summary_html))\n", + " \n", + " # Display color legend\n", + " legend_items = []\n", + " for elem_type, color in self.element_colors.items():\n", + " if elem_type != 'default':\n", + " legend_items.append(f\"\"\"\n", + " \n", + " \n", + " {elem_type.replace('_', ' ').title()}\n", + " \n", + " \"\"\")\n", + " \n", + " display(HTML(f\"\"\"\n", + "
\n", + " \uD83C\uDFA8 Element Colors:
\n", + " {''.join(legend_items)}\n", + "
\n", + " \"\"\"))\n", + " \n", + " # Display annotated images with their corresponding elements (filtered by selection)\n", + " if pages:\n", + " display(HTML(\"

\uD83D\uDDBC️ Annotated Images & Elements

\"))\n", + " \n", + " # Sort selected pages for display\n", + " sorted_selected = sorted(selected_pages)\n", + " \n", + " for page_idx in sorted_selected:\n", + " if page_idx < len(pages):\n", + " page = pages[page_idx]\n", + " \n", + " # Display the annotated image\n", + " annotated_html = self._create_annotated_image(page, elements)\n", + " display(HTML(f\"
{annotated_html}
\"))\n", + " \n", + " # Display elements for this page immediately after the image\n", + " page_id = page.get('id', page_idx)\n", + " page_elements_html = self._create_page_elements_list(page_id, elements)\n", + " display(HTML(page_elements_html))\n", + " \n", + " except Exception as e:\n", + " display(HTML(f\"

❌ Error: {str(e)}

\"))\n", + " import traceback\n", + " display(HTML(f\"
{traceback.format_exc()}
\"))\n", + "\n", + "\n", + "# Simple usage functions\n", + "def render_ai_parse_output(parsed_result, page_selection=None):\n", + " \"\"\"Simple function to render ai_parse_document output with page selection.\n", + " \n", + " Args:\n", + " parsed_result: The parsed document result\n", + " page_selection: Optional page selection string. Examples:\n", + " - None or \"all\": Display all pages\n", + " - \"3\": Display only page 3\n", + " - \"1-5\": Display pages 1 through 5\n", + " - \"1,3,5\": Display specific pages\n", + " - \"1-3,7,10-12\": Mixed format\n", + " \"\"\"\n", + " renderer = DocumentRenderer()\n", + " renderer.render_document(parsed_result, page_selection)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "7498a4a3-7443-4503-bd13-10014953e73b", + "showTitle": true, + "tableResultSettingsMap": {}, + "title": "Debug Visualization Results" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "

\uD83D\uDD0D AI Parse Document Results

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "
\n", + "

\uD83D\uDCC4 Document Summary

\n", + "

Displaying: Pages 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 (10 of 78)

\n", + "

Elements on selected pages: 84

\n", + "

Element Types: text: 33, title: 1, figure: 18, page_header: 18, section_header: 12, table: 1, caption: 1

\n", + "

Document ID: 3a07bf7a-e00...

\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "
\n", + " \uD83C\uDFA8 Element Colors:
\n", + " \n", + " \n", + " \n", + " Section Header\n", + " \n", + " \n", + " \n", + " \n", + " Text\n", + " \n", + " \n", + " \n", + " \n", + " Figure\n", + " \n", + " \n", + " \n", + " \n", + " Caption\n", + " \n", + " \n", + " \n", + " \n", + " Page Footer\n", + " \n", + " \n", + " \n", + " \n", + " Page Header\n", + " \n", + " \n", + " \n", + " \n", + " Table\n", + " \n", + " \n", + " \n", + " \n", + " List\n", + " \n", + " \n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "

\uD83D\uDDBC️ Annotated Images & Elements

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
\n", + " \n", + "
\n", + " Page 1: 11 elements
\n", + " Original size: 1706×1280px | \n", + " Display size: 1024×768px | \n", + " Scale factor: 0.600
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \"Page\n", + " \n", + "
\n", + "
\n", + " TEXT#0\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT #0\n", + "
\n", + "
\n", + " eBook\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " TITLE#1\n", + "
\n", + " \n", + "
\n", + "
\n", + " TITLE #1\n", + "
\n", + "
\n", + " The Big Book of MLOps\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT#2\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT #2\n", + "
\n", + "
\n", + " NOW INCLUDING A SECTION ON LLMOPS\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " FIGURE#3\n", + "
\n", + " \n", + "
\n", + "
\n", + " FIGURE #3\n", + "
\n", + "
\n", + " Description: The logo features the word "databricks" alongside a stylized, geometric design composed of red and white triangles.\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT#4\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT #4\n", + "
\n", + "
\n", + " 2ND EDITION\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " FIGURE#5\n", + "
\n", + " \n", + "
\n", + "
\n", + " FIGURE #5\n", + "
\n", + "
\n", + " Description: Two teal and green hexagonal icons display stylized figures and abstract circular designs connected by lines.\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " FIGURE#6\n", + "
\n", + " \n", + "
\n", + "
\n", + " FIGURE #6\n", + "
\n", + "
\n", + " Description: An orange hexagon contains a circular design featuring a gear, wrench, and a person icon.\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT#7\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT #7\n", + "
\n", + "
\n", + " MODELOPS\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT#8\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT #8\n", + "
\n", + "
\n", + " DATAOPS\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT#9\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT #9\n", + "
\n", + "
\n", + " DEVOPS\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT#10\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT #10\n", + "
\n", + "
\n", + " JOSEPH BRADLEY RAFI KURLANSIK MATT THOMSON NIALL TURBITT\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "
\n", + "

\uD83D\uDCCB Page 1 Elements (11 items)

\n", + " \n", + "
\n", + "
\n", + "

\n", + " TEXT (ID: 0)\n", + "

\n", + " \n", + " [118, 323, 208, 353]\n", + " \n", + "
\n", + "
\n", + " eBook\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " TITLE (ID: 1)\n", + "

\n", + " \n", + " [118, 374, 659, 587]\n", + " \n", + "
\n", + "
\n", + " The Big Book of MLOps\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " TEXT (ID: 2)\n", + "

\n", + " \n", + " [118, 647, 457, 724]\n", + " \n", + "
\n", + "
\n", + " NOW INCLUDING A SECTION ON LLMOPS\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " FIGURE (ID: 3)\n", + "

\n", + " \n", + " [120, 1186, 358, 1226]\n", + " \n", + "
\n", + "
\n", + " Description: The logo features the word \"databricks\" alongside a stylized, geometric design composed of red and white triangles.\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " TEXT (ID: 4)\n", + "

\n", + " \n", + " [1531, 39, 1674, 181]\n", + " \n", + "
\n", + "
\n", + " 2ND EDITION\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " FIGURE (ID: 5)\n", + "

\n", + " \n", + " [655, 570, 1131, 793]\n", + " \n", + "
\n", + "
\n", + " Description: Two teal and green hexagonal icons display stylized figures and abstract circular designs connected by lines.\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " FIGURE (ID: 6)\n", + "

\n", + " \n", + " [1211, 570, 1409, 793]\n", + " \n", + "
\n", + "
\n", + " Description: An orange hexagon contains a circular design featuring a gear, wrench, and a person icon.\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " TEXT (ID: 7)\n", + "

\n", + " \n", + " [684, 874, 813, 896]\n", + " \n", + "
\n", + "
\n", + " MODELOPS\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " TEXT (ID: 8)\n", + "

\n", + " \n", + " [977, 874, 1083, 896]\n", + " \n", + "
\n", + "
\n", + " DATAOPS\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " TEXT (ID: 9)\n", + "

\n", + " \n", + " [1259, 874, 1356, 896]\n", + " \n", + "
\n", + "
\n", + " DEVOPS\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " TEXT (ID: 10)\n", + "

\n", + " \n", + " [853, 1201, 1565, 1220]\n", + " \n", + "
\n", + "
\n", + " JOSEPH BRADLEY RAFI KURLANSIK MATT THOMSON NIALL TURBITT\n", + "
\n", + "
\n", + " \n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
\n", + " \n", + "
\n", + " Page 2: 5 elements
\n", + " Original size: 1706×1280px | \n", + " Display size: 1024×768px | \n", + " Scale factor: 0.600
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \"Page\n", + " \n", + "
\n", + "
\n", + " PAGE_H#11\n", + "
\n", + " \n", + "
\n", + "
\n", + " PAGE_HEADER #11\n", + "
\n", + "
\n", + " BIG BOOK OF MLOPS - 2ND EDITION\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " PAGE_H#12\n", + "
\n", + " \n", + "
\n", + "
\n", + " PAGE_HEADER #12\n", + "
\n", + "
\n", + " 2\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " SECTIO#13\n", + "
\n", + " \n", + "
\n", + "
\n", + " SECTION_HEADER #13\n", + "
\n", + "
\n", + " Contents\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT#14\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT #14\n", + "
\n", + "
\n", + " CHAPTER 1 Introduction ......<DOT_LEADER> 5
CHAPTER 2 Big Book of MLOps V1 Recap ......<DOT_LEADER> 6
Why should I care about MLOps? ......<DOT_LEADER> 6
Guiding principles ......<DOT_LEADER> 6
Semantics of development, staging and production ......<DOT_LEADER> 7
ML deployment patterns ......<DOT_LEADER> 8
CHAPTER 3 What's New? ......<DOT_LEADER> 10
Unity Catalog ......<DOT_LEADER> 10
Benefits and architecture implications ......<DOT_LEADER> 11
Model Serving ......<DOT_LEADER> 13
Benefits and ar...\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " FIGURE#15\n", + "
\n", + " \n", + "
\n", + "
\n", + " FIGURE #15\n", + "
\n", + "
\n", + " Description: The image displays the Databricks logo, featuring a stylized red and gray geometric design alongside the company name in bold text.\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "
\n", + "

\uD83D\uDCCB Page 2 Elements (5 items)

\n", + " \n", + "
\n", + "
\n", + "

\n", + " PAGE HEADER (ID: 11)\n", + "

\n", + " \n", + " [59, 61, 347, 77]\n", + " \n", + "
\n", + "
\n", + " BIG BOOK OF MLOPS - 2ND EDITION\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " PAGE HEADER (ID: 12)\n", + "

\n", + " \n", + " [1634, 61, 1647, 77]\n", + " \n", + "
\n", + "
\n", + " 2\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " SECTION HEADER (ID: 13)\n", + "

\n", + " \n", + " [59, 231, 297, 287]\n", + " \n", + "
\n", + "
\n", + " Contents\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " TEXT (ID: 14)\n", + "

\n", + " \n", + " [487, 257, 1601, 1056]\n", + " \n", + "
\n", + "
\n", + " CHAPTER 1 Introduction ...... 5\n", + "CHAPTER 2 Big Book of MLOps V1 Recap ...... 6\n", + "Why should I care about MLOps? ...... 6\n", + "Guiding principles ...... 6\n", + "Semantics of development, staging and production ...... 7\n", + "ML deployment patterns ...... 8\n", + "CHAPTER 3 What's New? ...... 10\n", + "Unity Catalog ...... 10\n", + "Benefits and architecture implications ...... 11\n", + "Model Serving ...... 13\n", + "Benefits and architecture implications ...... 13\n", + "Lakehouse Monitoring ...... 15\n", + "Benefits and architecture implications ...... 15\n", + "CHAPTER 4 Design Decisions ...... 17\n", + "Unity Catalog ...... 17\n", + "Organizing data and Al assets ...... 17\n", + "Concepts ...... 18\n", + "Considerations ...... 21\n", + "Recommended organization ...... 23\n", + "Model Serving...... 27\n", + "Pre-deployment testing ...... 28\n", + "Real-time model deployment ...... 29\n", + "Implementing in Databricks ...... 30\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " FIGURE (ID: 15)\n", + "

\n", + " \n", + " [59, 1181, 322, 1226]\n", + " \n", + "
\n", + "
\n", + " Description: The image displays the Databricks logo, featuring a stylized red and gray geometric design alongside the company name in bold text.\n", + "
\n", + "
\n", + " \n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
\n", + " \n", + "
\n", + " Page 3: 5 elements
\n", + " Original size: 1706×1280px | \n", + " Display size: 1024×768px | \n", + " Scale factor: 0.600
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \"Page\n", + " \n", + "
\n", + "
\n", + " PAGE_H#16\n", + "
\n", + " \n", + "
\n", + "
\n", + " PAGE_HEADER #16\n", + "
\n", + "
\n", + " BIG BOOK OF MLOPS - 2ND EDITION\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " PAGE_H#17\n", + "
\n", + " \n", + "
\n", + "
\n", + " PAGE_HEADER #17\n", + "
\n", + "
\n", + " 3\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " SECTIO#18\n", + "
\n", + " \n", + "
\n", + "
\n", + " SECTION_HEADER #18\n", + "
\n", + "
\n", + " Contents\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " TABLE#19\n", + "
\n", + " \n", + "
\n", + "
\n", + " TABLE #19\n", + "
\n", + "
\n", + "
' in table_html:\n", + " table_html = table_html.replace('', f'')\n", + " if '' in table_html:\n", + " table_html = table_html.replace('', f'')\n", + " if '
or tags in first row\n", + " import re\n", + " \n", + " # Find first row (either in thead or tbody)\n", + " first_row_match = re.search(r']*>(.*?)
CHAPTER 5Reference Architecture31
Multi-environment view
Development34
Data35
Exploratory data analysis (EDA)35
Project code36
Model training development36
Model validation and deployment development37
Commit code38
Staging39
Data40
Merge code40
Integration tests (CI)40
Merge41
Cut release branch41
Production42
Model training44
Model validation45
Model deployment46
Model Serving48
Inference: batch or streaming48
Lakehouse Monitoring49
Retraining49
\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
\n", + " FIGURE#20\n", + "
\n", + " \n", + "
\n", + "
\n", + " FIGURE #20\n", + "
\n", + "
\n", + " Description: The logo features the word "databricks" in a stylized font accompanied by a red, geometric icon to the left.\n", + "
\n", + "
\n", + "
\n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "
\n", + "

\uD83D\uDCCB Page 3 Elements (5 items)

\n", + " \n", + "
\n", + "
\n", + "

\n", + " PAGE HEADER (ID: 16)\n", + "

\n", + " \n", + " [59, 61, 347, 77]\n", + " \n", + "
\n", + "
\n", + " BIG BOOK OF MLOPS - 2ND EDITION\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " PAGE HEADER (ID: 17)\n", + "

\n", + " \n", + " [1634, 61, 1647, 77]\n", + " \n", + "
\n", + "
\n", + " 3\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " SECTION HEADER (ID: 18)\n", + "

\n", + " \n", + " [59, 231, 301, 287]\n", + " \n", + "
\n", + "
\n", + " Contents\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " TABLE (ID: 19)\n", + "

\n", + " \n", + " [484, 256, 1603, 967]\n", + " \n", + "
\n", + "
\n", + "
CHAPTER 5Reference Architecture31
Multi-environment view
Development34
Data35
Exploratory data analysis (EDA)35
Project code36
Model training development36
Model validation and deployment development37
Commit code38
Staging39
Data40
Merge code40
Integration tests (CI)40
Merge41
Cut release branch41
Production42
Model training44
Model validation45
Model deployment46
Model Serving48
Inference: batch or streaming48
Lakehouse Monitoring49
Retraining49
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " FIGURE (ID: 20)\n", + "

\n", + " \n", + " [51, 1183, 326, 1227]\n", + " \n", + "
\n", + "
\n", + " Description: The logo features the word \"databricks\" in a stylized font accompanied by a red, geometric icon to the left.\n", + "
\n", + "
\n", + " \n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
\n", + " \n", + "
\n", + " Page 4: 5 elements
\n", + " Original size: 1706×1280px | \n", + " Display size: 1024×768px | \n", + " Scale factor: 0.600
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \"Page\n", + " \n", + "
\n", + "
\n", + " PAGE_H#21\n", + "
\n", + " \n", + "
\n", + "
\n", + " PAGE_HEADER #21\n", + "
\n", + "
\n", + " BIG BOOK OF MLOP'S - 2ND EDITION\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " PAGE_H#22\n", + "
\n", + " \n", + "
\n", + "
\n", + " PAGE_HEADER #22\n", + "
\n", + "
\n", + " 4\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " SECTIO#23\n", + "
\n", + " \n", + "
\n", + "
\n", + " SECTION_HEADER #23\n", + "
\n", + "
\n", + " Contents\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT#24\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT #24\n", + "
\n", + "
\n", + " CHAPTER 6 LLMOps 51
What changes with LLMs? 51
Key components of LLM-powered applications 54
Prompt engineering 54
Leveraging your own data 56
Retrieval augmented generation (RAG) 58
Typical RAG workflow 59
Vector database 60
Benefits of vector databases in a RAG workflow 61
Fine-tuning LLMs 62
When to use fine-tuning? 63
Fine-tuning in practice 63
Pre-training 64
When to use pre-training? 64
Pre-training in practice 65
Third-party APIs vs. self-hosted models 66
Model evaluation 67
LLMs as evalu...\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " FIGURE#25\n", + "
\n", + " \n", + "
\n", + "
\n", + " FIGURE #25\n", + "
\n", + "
\n", + " Description: The image features the Databricks logo, composed of a red diamond and red chevron shapes alongside the company name in black text.\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "
\n", + "

\uD83D\uDCCB Page 4 Elements (5 items)

\n", + " \n", + "
\n", + "
\n", + "

\n", + " PAGE HEADER (ID: 21)\n", + "

\n", + " \n", + " [59, 64, 345, 80]\n", + " \n", + "
\n", + "
\n", + " BIG BOOK OF MLOP'S - 2ND EDITION\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " PAGE HEADER (ID: 22)\n", + "

\n", + " \n", + " [1634, 64, 1651, 76]\n", + " \n", + "
\n", + "
\n", + " 4\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " SECTION HEADER (ID: 23)\n", + "

\n", + " \n", + " [59, 240, 297, 287]\n", + " \n", + "
\n", + "
\n", + " Contents\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " TEXT (ID: 24)\n", + "

\n", + " \n", + " [484, 259, 1603, 1153]\n", + " \n", + "
\n", + "
\n", + " CHAPTER 6 LLMOps 51\n", + "What changes with LLMs? 51\n", + "Key components of LLM-powered applications 54\n", + "Prompt engineering 54\n", + "Leveraging your own data 56\n", + "Retrieval augmented generation (RAG) 58\n", + "Typical RAG workflow 59\n", + "Vector database 60\n", + "Benefits of vector databases in a RAG workflow 61\n", + "Fine-tuning LLMs 62\n", + "When to use fine-tuning? 63\n", + "Fine-tuning in practice 63\n", + "Pre-training 64\n", + "When to use pre-training? 64\n", + "Pre-training in practice 65\n", + "Third-party APIs vs. self-hosted models 66\n", + "Model evaluation 67\n", + "LLMs as evaluators 69\n", + "Human feedback in evaluation 69\n", + "Packaging models or pipelines for deployment 70\n", + "LLM Inference 71\n", + "Real-time inference 71\n", + "Batch inference 71\n", + "Inference with large models 72\n", + "Managing cost/performance trade-offs 72\n", + "Methods for reducing costs of inference 73\n", + "Reference architecture 74\n", + "RAG with a third-party LLM API 74\n", + "RAG with a fine-tuned OSS model 75\n", + "\n", + "CHAPTER 7 Conclusion 78\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " FIGURE (ID: 25)\n", + "

\n", + " \n", + " [59, 1181, 322, 1226]\n", + " \n", + "
\n", + "
\n", + " Description: The image features the Databricks logo, composed of a red diamond and red chevron shapes alongside the company name in black text.\n", + "
\n", + "
\n", + " \n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
\n", + " \n", + "
\n", + " Page 5: 9 elements
\n", + " Original size: 1706×1280px | \n", + " Display size: 1024×768px | \n", + " Scale factor: 0.600
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \"Page\n", + " \n", + "
\n", + "
\n", + " PAGE_H#26\n", + "
\n", + " \n", + "
\n", + "
\n", + " PAGE_HEADER #26\n", + "
\n", + "
\n", + " BIG BOOK OF MLOPS - 2ND EDITION\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " PAGE_H#27\n", + "
\n", + " \n", + "
\n", + "
\n", + " PAGE_HEADER #27\n", + "
\n", + "
\n", + " 5\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " SECTIO#28\n", + "
\n", + " \n", + "
\n", + "
\n", + " SECTION_HEADER #28\n", + "
\n", + "
\n", + " CHAPTER 1
Introduction\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT#29\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT #29\n", + "
\n", + "
\n", + " Machine learning operations (MLOps) is a rapidly evolving field where building and maintaining robust, flexible and efficient workflows is critical. At Databricks, we view MLOps as the set of processes and automation for managing data, code and models to improve performance stability and long-term efficiency in ML systems.\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT#30\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT #30\n", + "
\n", + "
\n", + " MLOps = DataOps + DevOps + ModelOps\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT#31\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT #31\n", + "
\n", + "
\n", + " Through this lens, we strive to continuously innovate and advance our product offerings to simplify the ability to build AI-powered solutions on the Lakehouse. We believe there is no greater accelerant to delivering ML to production than building on a unified, data-centric AI platform. On Databricks, both data and models can be managed and governed in a single governance solution in the form of Unity Catalog. The previously complex infrastructure required to serve real-time models can now be rep...\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT#32\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT #32\n", + "
\n", + "
\n", + " Perhaps the most significant recent change in the machine learning landscape has been the rapid advancement of generative AI. Generative models such as large language models (LLMs) and image generation models have revolutionized the field, unlocking previously unattainable levels of natural language and image generation. However, their arrival also introduces a new set of challenges and decisions to be made in the context of MLOps.\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT#33\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT #33\n", + "
\n", + "
\n", + " With all these developments in mind, we're excited to present this updated version of the Big Book of MLOps. This guide incorporates new Databricks features such as Models in Unity Catalog, Model Serving, and Lakehouse Monitoring into our MLOps architecture recommendations. We start by outlining the themes that still remain relevant from the previous version of the Big Book of MLOps. Following this, we unpack the new features introduced in this version, their impact on the previous reference arc...\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " FIGURE#34\n", + "
\n", + " \n", + "
\n", + "
\n", + " FIGURE #34\n", + "
\n", + "
\n", + " Description: The logo features the word "databricks" in bold, dark blue text accompanied by a stylized red and white graphic to the left.\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "
\n", + "

\uD83D\uDCCB Page 5 Elements (9 items)

\n", + " \n", + "
\n", + "
\n", + "

\n", + " PAGE HEADER (ID: 26)\n", + "

\n", + " \n", + " [59, 61, 345, 77]\n", + " \n", + "
\n", + "
\n", + " BIG BOOK OF MLOPS - 2ND EDITION\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " PAGE HEADER (ID: 27)\n", + "

\n", + " \n", + " [1634, 61, 1647, 77]\n", + " \n", + "
\n", + "
\n", + " 5\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " SECTION HEADER (ID: 28)\n", + "

\n", + " \n", + " [59, 210, 301, 287]\n", + " \n", + "
\n", + "
\n", + " CHAPTER 1\n", + "Introduction\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " TEXT (ID: 29)\n", + "

\n", + " \n", + " [607, 259, 1622, 374]\n", + " \n", + "
\n", + "
\n", + " Machine learning operations (MLOps) is a rapidly evolving field where building and maintaining robust, flexible and efficient workflows is critical. At Databricks, we view MLOps as the set of processes and automation for managing data, code and models to improve performance stability and long-term efficiency in ML systems.\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " TEXT (ID: 30)\n", + "

\n", + " \n", + " [853, 433, 1232, 451]\n", + " \n", + "
\n", + "
\n", + " MLOps = DataOps + DevOps + ModelOps\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " TEXT (ID: 31)\n", + "

\n", + " \n", + " [607, 509, 1649, 750]\n", + " \n", + "
\n", + "
\n", + " Through this lens, we strive to continuously innovate and advance our product offerings to simplify the ability to build AI-powered solutions on the Lakehouse. We believe there is no greater accelerant to delivering ML to production than building on a unified, data-centric AI platform. On Databricks, both data and models can be managed and governed in a single governance solution in the form of Unity Catalog. The previously complex infrastructure required to serve real-time models can now be replaced and easily scaled with Databricks Model Serving. Long-term efficiency and performance stability of ML in production can be achieved using Databricks Lakehouse Monitoring. These components collectively form the data pipelines of an ML solution, all of which can be orchestrated using Databricks Workflows.\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " TEXT (ID: 32)\n", + "

\n", + " \n", + " [607, 777, 1647, 893]\n", + " \n", + "
\n", + "
\n", + " Perhaps the most significant recent change in the machine learning landscape has been the rapid advancement of generative AI. Generative models such as large language models (LLMs) and image generation models have revolutionized the field, unlocking previously unattainable levels of natural language and image generation. However, their arrival also introduces a new set of challenges and decisions to be made in the context of MLOps.\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " TEXT (ID: 33)\n", + "

\n", + " \n", + " [607, 919, 1651, 1161]\n", + " \n", + "
\n", + "
\n", + " With all these developments in mind, we're excited to present this updated version of the Big Book of MLOps. This guide incorporates new Databricks features such as Models in Unity Catalog, Model Serving, and Lakehouse Monitoring into our MLOps architecture recommendations. We start by outlining the themes that still remain relevant from the previous version of the Big Book of MLOps. Following this, we unpack the new features introduced in this version, their impact on the previous reference architecture, and best practices when incorporating these into your MLOps workflows. Next, we present our updated MLOps reference architecture, along with the details of its processes. Finally, we provide guidance for deploying generative AI applications to production on Databricks, focusing on productionizing LLMs.\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " FIGURE (ID: 34)\n", + "

\n", + " \n", + " [53, 1180, 324, 1229]\n", + " \n", + "
\n", + "
\n", + " Description: The logo features the word \"databricks\" in bold, dark blue text accompanied by a stylized red and white graphic to the left.\n", + "
\n", + "
\n", + " \n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
\n", + " \n", + "
\n", + " Page 6: 10 elements
\n", + " Original size: 1706×1280px | \n", + " Display size: 1024×768px | \n", + " Scale factor: 0.600
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \"Page\n", + " \n", + "
\n", + "
\n", + " PAGE_H#35\n", + "
\n", + " \n", + "
\n", + "
\n", + " PAGE_HEADER #35\n", + "
\n", + "
\n", + " BIG BOOK OF MLOPS - 2ND EDITION\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " PAGE_H#36\n", + "
\n", + " \n", + "
\n", + "
\n", + " PAGE_HEADER #36\n", + "
\n", + "
\n", + " No content available\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " SECTIO#37\n", + "
\n", + " \n", + "
\n", + "
\n", + " SECTION_HEADER #37\n", + "
\n", + "
\n", + " CHAPTER 2 Big Book of MLOps V1 Recap\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " FIGURE#38\n", + "
\n", + " \n", + "
\n", + "
\n", + " FIGURE #38\n", + "
\n", + "
\n", + " Description: Three teal hexagons display line-art icons representing people, cylinders, and gears, labeled "ModelOps," "DataOps," and "DevOps."\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT#39\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT #39\n", + "
\n", + "
\n", + " We begin with a brief recap of the core points discussed in the previous version of the Big Book of MLOps. While the recommended reference architecture has evolved due to new features and product updates, the core themes discussed, such as the importance of MLOps, guiding principles and the fundamentals of MLOps on Databricks, remain pertinent. In this section we focus on summarizing those elements that remain unchanged. For a more in-depth discussion of any of these points, we refer the reader ...\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " SECTIO#40\n", + "
\n", + " \n", + "
\n", + "
\n", + " SECTION_HEADER #40\n", + "
\n", + "
\n", + " Why should I care about MLOps?\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT#41\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT #41\n", + "
\n", + "
\n", + " We continue to stress the importance of defining an effective MLOps strategy. Databricks customers like CareSource, which has since implemented our recommended MLOps architecture, have witnessed firsthand the value this can bring. Through streamlining the process of delivering models to production, time to business value is accelerated. This efficiency has the knock-on effect of giving data science teams the freedom and confidence to transition to subsequent projects without the need for continu...\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " SECTIO#42\n", + "
\n", + " \n", + "
\n", + "
\n", + " SECTION_HEADER #42\n", + "
\n", + "
\n", + " Guiding principles\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT#43\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT #43\n", + "
\n", + "
\n", + " One guiding principle that continues to lie at the heart of the Lakehouse AI vision is taking a data-centric approach to machine learning. With the increasing prevalence of generative AI, this perspective remains just as important. The core constituents of any ML project can be viewed simply as data pipelines: feature engineering, training, model deployment, inference and monitoring pipelines are all data pipelines. As such, operationalizing an ML solution requires joining data from predictions,...\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " FIGURE#44\n", + "
\n", + " \n", + "
\n", + "
\n", + " FIGURE #44\n", + "
\n", + "
\n", + " Description: The logo features the word "databricks" in bold, dark blue text accompanied by a stylized red and white graphic to the left.\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "
\n", + "

\uD83D\uDCCB Page 6 Elements (10 items)

\n", + " \n", + "
\n", + "
\n", + "

\n", + " PAGE HEADER (ID: 35)\n", + "

\n", + " \n", + " [59, 61, 347, 77]\n", + " \n", + "
\n", + "
\n", + " BIG BOOK OF MLOPS - 2ND EDITION\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " PAGE HEADER (ID: 36)\n", + "

\n", + " \n", + " [1630, 61, 1643, 77]\n", + " \n", + "
\n", + "
\n", + " No content\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " SECTION HEADER (ID: 37)\n", + "

\n", + " \n", + " [59, 210, 415, 339]\n", + " \n", + "
\n", + "
\n", + " CHAPTER 2 Big Book of MLOps V1 Recap\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " FIGURE (ID: 38)\n", + "

\n", + " \n", + " [59, 533, 522, 733]\n", + " \n", + "
\n", + "
\n", + " Description: Three teal hexagons display line-art icons representing people, cylinders, and gears, labeled \"ModelOps,\" \"DataOps,\" and \"DevOps.\"\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " TEXT (ID: 39)\n", + "

\n", + " \n", + " [609, 256, 1617, 439]\n", + " \n", + "
\n", + "
\n", + " We begin with a brief recap of the core points discussed in the previous version of the Big Book of MLOps. While the recommended reference architecture has evolved due to new features and product updates, the core themes discussed, such as the importance of MLOps, guiding principles and the fundamentals of MLOps on Databricks, remain pertinent. In this section we focus on summarizing those elements that remain unchanged. For a more in-depth discussion of any of these points, we refer the reader to last year's Big Book of MLOps.\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " SECTION HEADER (ID: 40)\n", + "

\n", + " \n", + " [609, 479, 1083, 511]\n", + " \n", + "
\n", + "
\n", + " Why should I care about MLOps?\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " TEXT (ID: 41)\n", + "

\n", + " \n", + " [609, 527, 1597, 709]\n", + " \n", + "
\n", + "
\n", + " We continue to stress the importance of defining an effective MLOps strategy. Databricks customers like CareSource, which has since implemented our recommended MLOps architecture, have witnessed firsthand the value this can bring. Through streamlining the process of delivering models to production, time to business value is accelerated. This efficiency has the knock-on effect of giving data science teams the freedom and confidence to transition to subsequent projects without the need for continuous manual oversight of models in production.\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " SECTION HEADER (ID: 42)\n", + "

\n", + " \n", + " [609, 749, 876, 781]\n", + " \n", + "
\n", + "
\n", + " Guiding principles\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " TEXT (ID: 43)\n", + "

\n", + " \n", + " [609, 799, 1601, 1009]\n", + " \n", + "
\n", + "
\n", + " One guiding principle that continues to lie at the heart of the Lakehouse AI vision is taking a data-centric approach to machine learning. With the increasing prevalence of generative AI, this perspective remains just as important. The core constituents of any ML project can be viewed simply as data pipelines: feature engineering, training, model deployment, inference and monitoring pipelines are all data pipelines. As such, operationalizing an ML solution requires joining data from predictions, monitoring and feature tables with other relevant data. Fundamentally, the simplest way to achieve this is to develop AI-powered solutions on the same platform used to manage production data.\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " FIGURE (ID: 44)\n", + "

\n", + " \n", + " [53, 1180, 324, 1229]\n", + " \n", + "
\n", + "
\n", + " Description: The logo features the word \"databricks\" in bold, dark blue text accompanied by a stylized red and white graphic to the left.\n", + "
\n", + "
\n", + " \n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
\n", + " \n", + "
\n", + " Page 7: 9 elements
\n", + " Original size: 1706×1280px | \n", + " Display size: 1024×768px | \n", + " Scale factor: 0.600
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \"Page\n", + " \n", + "
\n", + "
\n", + " PAGE_H#45\n", + "
\n", + " \n", + "
\n", + "
\n", + " PAGE_HEADER #45\n", + "
\n", + "
\n", + " BIG BOOK OF MLOPs - 2ND EDITION\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " PAGE_H#46\n", + "
\n", + " \n", + "
\n", + "
\n", + " PAGE_HEADER #46\n", + "
\n", + "
\n", + " No content available\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " SECTIO#47\n", + "
\n", + " \n", + "
\n", + "
\n", + " SECTION_HEADER #47\n", + "
\n", + "
\n", + " Semantics of development, staging and production\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " FIGURE#48\n", + "
\n", + " \n", + "
\n", + "
\n", + " FIGURE #48\n", + "
\n", + "
\n", + " Description: Three icons representing "Code," "Data," and "Models" are arranged horizontally with labels below each.\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT#49\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT #49\n", + "
\n", + "
\n", + " Note: Throughout this paper we operate under the assumption of three distinct execution environments — development, staging and production — in the form of three separate Databricks workspaces. There can be variations of these three stages, such as alternative naming conventions or splitting staging into separate “test” and “QA” substages. Although not recommended, it is also possible to create three distinct environments within a single Databricks workspace through the use of access controls an...\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT#50\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT #50\n", + "
\n", + "
\n", + " An ML solution comprises data, code and models. These assets need to be developed, tested (staging) and deployed (production). For each of these stages, we also need to operate within an execution environment. As such, each of data, code, models and execution environments are notionally divided into development, staging and production.\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " FIGURE#51\n", + "
\n", + " \n", + "
\n", + "
\n", + " FIGURE #51\n", + "
\n", + "
\n", + " Description: Three colored boxes labeled "Developed," "Tested," and "Deployed" are arranged horizontally with corresponding lowercase labels below and arrows pointing right.\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT#52\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT #52\n", + "
\n", + "
\n", + " Each of these stages has distinct access controls and quality guarantees, ranging from the open and exploratory development stage through to the locked-down and quality-assured production stage.\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " FIGURE#53\n", + "
\n", + " \n", + "
\n", + "
\n", + " FIGURE #53\n", + "
\n", + "
\n", + " Description: The logo features the word "databricks" in a bold, sans-serif font accompanied by a red and gray geometric design to the left.\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "
\n", + "

\uD83D\uDCCB Page 7 Elements (9 items)

\n", + " \n", + "
\n", + "
\n", + "

\n", + " PAGE HEADER (ID: 45)\n", + "

\n", + " \n", + " [59, 64, 347, 76]\n", + " \n", + "
\n", + "
\n", + " BIG BOOK OF MLOPs - 2ND EDITION\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " PAGE HEADER (ID: 46)\n", + "

\n", + " \n", + " [1634, 64, 1651, 76]\n", + " \n", + "
\n", + "
\n", + " No content\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " SECTION HEADER (ID: 47)\n", + "

\n", + " \n", + " [607, 261, 1352, 291]\n", + " \n", + "
\n", + "
\n", + " Semantics of development, staging and production\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " FIGURE (ID: 48)\n", + "

\n", + " \n", + " [866, 317, 1329, 484]\n", + " \n", + "
\n", + "
\n", + " Description: Three icons representing \"Code,\" \"Data,\" and \"Models\" are arranged horizontally with labels below each.\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " TEXT (ID: 49)\n", + "

\n", + " \n", + " [59, 417, 457, 853]\n", + " \n", + "
\n", + "
\n", + " Note: Throughout this paper we operate under the assumption of three distinct execution environments — development, staging and production — in the form of three separate Databricks workspaces. There can be variations of these three stages, such as alternative naming conventions or splitting staging into separate “test” and “QA” substages. Although not recommended, it is also possible to create three distinct environments within a single Databricks workspace through the use of access controls and Git branches. Regardless of how environment separation is achieved, the core principles of the workflow and recommendations presented are generally applicable.\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " TEXT (ID: 50)\n", + "

\n", + " \n", + " [607, 546, 1580, 660]\n", + " \n", + "
\n", + "
\n", + " An ML solution comprises data, code and models. These assets need to be developed, tested (staging) and deployed (production). For each of these stages, we also need to operate within an execution environment. As such, each of data, code, models and execution environments are notionally divided into development, staging and production.\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " FIGURE (ID: 51)\n", + "

\n", + " \n", + " [786, 704, 1409, 921]\n", + " \n", + "
\n", + "
\n", + " Description: Three colored boxes labeled \"Developed,\" \"Tested,\" and \"Deployed\" are arranged horizontally with corresponding lowercase labels below and arrows pointing right.\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " TEXT (ID: 52)\n", + "

\n", + " \n", + " [607, 974, 1544, 1024]\n", + " \n", + "
\n", + "
\n", + " Each of these stages has distinct access controls and quality guarantees, ranging from the open and exploratory development stage through to the locked-down and quality-assured production stage.\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " FIGURE (ID: 53)\n", + "

\n", + " \n", + " [53, 1180, 324, 1226]\n", + " \n", + "
\n", + "
\n", + " Description: The logo features the word \"databricks\" in a bold, sans-serif font accompanied by a red and gray geometric design to the left.\n", + "
\n", + "
\n", + " \n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
\n", + " \n", + "
\n", + " Page 8: 11 elements
\n", + " Original size: 1706×1280px | \n", + " Display size: 1024×768px | \n", + " Scale factor: 0.600
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \"Page\n", + " \n", + "
\n", + "
\n", + " PAGE_H#54\n", + "
\n", + " \n", + "
\n", + "
\n", + " PAGE_HEADER #54\n", + "
\n", + "
\n", + " BIG BOOK OF MLOPS - 2ND EDITION\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " PAGE_H#55\n", + "
\n", + " \n", + "
\n", + "
\n", + " PAGE_HEADER #55\n", + "
\n", + "
\n", + " 8\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " SECTIO#56\n", + "
\n", + " \n", + "
\n", + "
\n", + " SECTION_HEADER #56\n", + "
\n", + "
\n", + " ML deployment patterns\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT#57\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT #57\n", + "
\n", + "
\n", + " Code and models often progress asynchronously through these stages. Thus, it becomes crucial to leverage a solution that allows for the management of model artifacts independently of code, making it possible to update a production model without necessarily making a code change. Data, much like code and models, can be labeled as development, staging or production, indicating not only its origin but also its quality and reliability.\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT#58\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT #58\n", + "
\n", + "
\n", + " Given the independent lifecycles of code and models, there are two opposing strategies to moving code and ML models from development, through staging and subsequently to production:\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " SECTIO#59\n", + "
\n", + " \n", + "
\n", + "
\n", + " SECTION_HEADER #59\n", + "
\n", + "
\n", + " DEPLOY CODE\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " FIGURE#60\n", + "
\n", + " \n", + "
\n", + "
\n", + " FIGURE #60\n", + "
\n", + "
\n", + " Description: Three rectangular boxes, labeled "dev," "staging," and "prod," illustrate a sequential process with arrows and icons representing code and models.\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT#61\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT #61\n", + "
\n", + "
\n", + " - Code for an ML project is developed in the development environment, and this code is then moved to the staging environment, where it is tested. Following successful testing, the project code is deployed to the production environment, where it is executed.\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT#62\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT #62\n", + "
\n", + "
\n", + " - Model training code is tested in the staging environment using a subset of data, and the model training pipeline is executed in the production environment\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT#63\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT #63\n", + "
\n", + "
\n", + " - The model deployment process of validating a model and additionally conducting comparisons versus any existing production model all run within the production environment\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " FIGURE#64\n", + "
\n", + " \n", + "
\n", + "
\n", + " FIGURE #64\n", + "
\n", + "
\n", + " Description: The logo features the Databricks name in bold text accompanied by a stylized red and gray graphic to the left.\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "
\n", + "

\uD83D\uDCCB Page 8 Elements (11 items)

\n", + " \n", + "
\n", + "
\n", + "

\n", + " PAGE HEADER (ID: 54)\n", + "

\n", + " \n", + " [59, 61, 345, 77]\n", + " \n", + "
\n", + "
\n", + " BIG BOOK OF MLOPS - 2ND EDITION\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " PAGE HEADER (ID: 55)\n", + "

\n", + " \n", + " [1634, 61, 1647, 77]\n", + " \n", + "
\n", + "
\n", + " 8\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " SECTION HEADER (ID: 56)\n", + "

\n", + " \n", + " [607, 259, 969, 293]\n", + " \n", + "
\n", + "
\n", + " ML deployment patterns\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " TEXT (ID: 57)\n", + "

\n", + " \n", + " [607, 309, 1578, 456]\n", + " \n", + "
\n", + "
\n", + " Code and models often progress asynchronously through these stages. Thus, it becomes crucial to leverage a solution that allows for the management of model artifacts independently of code, making it possible to update a production model without necessarily making a code change. Data, much like code and models, can be labeled as development, staging or production, indicating not only its origin but also its quality and reliability.\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " TEXT (ID: 58)\n", + "

\n", + " \n", + " [607, 481, 1580, 534]\n", + " \n", + "
\n", + "
\n", + " Given the independent lifecycles of code and models, there are two opposing strategies to moving code and ML models from development, through staging and subsequently to production:\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " SECTION HEADER (ID: 59)\n", + "

\n", + " \n", + " [607, 604, 741, 620]\n", + " \n", + "
\n", + "
\n", + " DEPLOY CODE\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " FIGURE (ID: 60)\n", + "

\n", + " \n", + " [605, 620, 1554, 826]\n", + " \n", + "
\n", + "
\n", + " Description: Three rectangular boxes, labeled \"dev,\" \"staging,\" and \"prod,\" illustrate a sequential process with arrows and icons representing code and models.\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " TEXT (ID: 61)\n", + "

\n", + " \n", + " [638, 899, 1527, 981]\n", + " \n", + "
\n", + "
\n", + " - Code for an ML project is developed in the development environment, and this code is then moved to the staging environment, where it is tested. Following successful testing, the project code is deployed to the production environment, where it is executed.\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " TEXT (ID: 62)\n", + "

\n", + " \n", + " [638, 1007, 1540, 1060]\n", + " \n", + "
\n", + "
\n", + " - Model training code is tested in the staging environment using a subset of data, and the model training pipeline is executed in the production environment\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " TEXT (ID: 63)\n", + "

\n", + " \n", + " [638, 1087, 1544, 1139]\n", + " \n", + "
\n", + "
\n", + " - The model deployment process of validating a model and additionally conducting comparisons versus any existing production model all run within the production environment\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " FIGURE (ID: 64)\n", + "

\n", + " \n", + " [53, 1180, 324, 1229]\n", + " \n", + "
\n", + "
\n", + " Description: The logo features the Databricks name in bold text accompanied by a stylized red and gray graphic to the left.\n", + "
\n", + "
\n", + " \n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
\n", + " \n", + "
\n", + " Page 9: 8 elements
\n", + " Original size: 1706×1280px | \n", + " Display size: 1024×768px | \n", + " Scale factor: 0.600
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \"Page\n", + " \n", + "
\n", + "
\n", + " PAGE_H#65\n", + "
\n", + " \n", + "
\n", + "
\n", + " PAGE_HEADER #65\n", + "
\n", + "
\n", + " BIG BOOK OF MLOPS - 2ND EDITION\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " PAGE_H#66\n", + "
\n", + " \n", + "
\n", + "
\n", + " PAGE_HEADER #66\n", + "
\n", + "
\n", + " No content available\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " FIGURE#67\n", + "
\n", + " \n", + "
\n", + "
\n", + " FIGURE #67\n", + "
\n", + "
\n", + " Description: Three boxes labeled "dev," "staging," and "prod" display brain icons with code symbols, connected by dashed arrows.\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT#68\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT #68\n", + "
\n", + "
\n", + " - Model training is executed in the development environment. The produced model artifact is then moved to the staging environment for model validation checks, prior to deployment of the model to the production environment.\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT#69\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT #69\n", + "
\n", + "
\n", + " - This approach requires a separate path for deploying ancillary code such as inference and monitoring code. Subsequently, any pipelines that need to run in the production environment to support the operationalization of the model will necessarily need to go through a separate “deploy code” lifecycle — the code for these components being tested in staging and then deployed to production.\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT#70\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT #70\n", + "
\n", + "
\n", + " - This pattern is typically used when deploying a one-off model, or when model training is expensive and read-access to production data from the development environment is possible\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT#71\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT #71\n", + "
\n", + "
\n", + " As in our prior paper, we recommend a deploy code approach for the majority of use cases, and the reference architecture presented in this update continues to follow this recommendation.\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " FIGURE#72\n", + "
\n", + " \n", + "
\n", + "
\n", + " FIGURE #72\n", + "
\n", + "
\n", + " Description: The logo features the Databricks name in bold text accompanied by a stylized red and gray graphic to the left.\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "
\n", + "

\uD83D\uDCCB Page 9 Elements (8 items)

\n", + " \n", + "
\n", + "
\n", + "

\n", + " PAGE HEADER (ID: 65)\n", + "

\n", + " \n", + " [59, 61, 345, 77]\n", + " \n", + "
\n", + "
\n", + " BIG BOOK OF MLOPS - 2ND EDITION\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " PAGE HEADER (ID: 66)\n", + "

\n", + " \n", + " [1634, 61, 1647, 77]\n", + " \n", + "
\n", + "
\n", + " No content\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " FIGURE (ID: 67)\n", + "

\n", + " \n", + " [602, 253, 1405, 481]\n", + " \n", + "
\n", + "
\n", + " Description: Three boxes labeled \"dev,\" \"staging,\" and \"prod\" display brain icons with code symbols, connected by dashed arrows.\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " TEXT (ID: 68)\n", + "

\n", + " \n", + " [640, 530, 1584, 616]\n", + " \n", + "
\n", + "
\n", + " - Model training is executed in the development environment. The produced model artifact is then moved to the staging environment for model validation checks, prior to deployment of the model to the production environment.\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " TEXT (ID: 69)\n", + "

\n", + " \n", + " [640, 640, 1573, 789]\n", + " \n", + "
\n", + "
\n", + " - This approach requires a separate path for deploying ancillary code such as inference and monitoring code. Subsequently, any pipelines that need to run in the production environment to support the operationalization of the model will necessarily need to go through a separate “deploy code” lifecycle — the code for these components being tested in staging and then deployed to production.\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " TEXT (ID: 70)\n", + "

\n", + " \n", + " [640, 811, 1578, 867]\n", + " \n", + "
\n", + "
\n", + " - This pattern is typically used when deploying a one-off model, or when model training is expensive and read-access to production data from the development environment is possible\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " TEXT (ID: 71)\n", + "

\n", + " \n", + " [607, 907, 1544, 960]\n", + " \n", + "
\n", + "
\n", + " As in our prior paper, we recommend a deploy code approach for the majority of use cases, and the reference architecture presented in this update continues to follow this recommendation.\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " FIGURE (ID: 72)\n", + "

\n", + " \n", + " [53, 1180, 324, 1229]\n", + " \n", + "
\n", + "
\n", + " Description: The logo features the Databricks name in bold text accompanied by a stylized red and gray graphic to the left.\n", + "
\n", + "
\n", + " \n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
\n", + " \n", + "
\n", + " Page 10: 11 elements
\n", + " Original size: 1706×1280px | \n", + " Display size: 1024×768px | \n", + " Scale factor: 0.600
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \"Page\n", + " \n", + "
\n", + "
\n", + " PAGE_H#73\n", + "
\n", + " \n", + "
\n", + "
\n", + " PAGE_HEADER #73\n", + "
\n", + "
\n", + " BIG BOOK OF MLOPS - 2ND EDITION\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " PAGE_H#74\n", + "
\n", + " \n", + "
\n", + "
\n", + " PAGE_HEADER #74\n", + "
\n", + "
\n", + " 10\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " SECTIO#75\n", + "
\n", + " \n", + "
\n", + "
\n", + " SECTION_HEADER #75\n", + "
\n", + "
\n", + " CHAPTER 3 What's New?\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT#76\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT #76\n", + "
\n", + "
\n", + " In this section we outline the key features and product updates introduced into our updated MLOps reference architecture. For each of these, we highlight the benefits they bring and how they impact our end-to-end MLOps workflow.\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " SECTIO#77\n", + "
\n", + " \n", + "
\n", + "
\n", + " SECTION_HEADER #77\n", + "
\n", + "
\n", + " Unity Catalog\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT#78\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT #78\n", + "
\n", + "
\n", + " The Lakehouse forms the foundation of a data-centric AI platform. Key to this is the ability to manage both data and AI assets from a unified governance solution on the Lakehouse. Databricks Unity Catalog enables this by providing centralized access control, auditing, lineage, and data discovery capabilities across Databricks workspaces.\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT#79\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT #79\n", + "
\n", + "
\n", + " These benefits are now extended to MLflow models with the introduction of Models in Unity Catalog. By providing a hosted version of the MLflow Model Registry in Unity Catalog, the full lifecycle of an ML model can be managed while leveraging Unity Catalog's capability to share assets across Databricks workspaces and trace lineage across both data and models.\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT#80\n", + "
\n", + " \n", + "
\n", + "
\n", + " TEXT #80\n", + "
\n", + "
\n", + " In addition to managing ML models, feature tables are also a part of Unity Catalog. With Feature Engineering in Unity Catalog, any Delta table in Unity Catalog that has been assigned a primary key (and additionally a timestamp key) can be used as a source of features to train and serve models. Furthermore, feature tables can now also be shared across different workspaces, and lineage recorded between other assets in the Lakehouse.\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " FIGURE#81\n", + "
\n", + " \n", + "
\n", + "
\n", + " FIGURE #81\n", + "
\n", + "
\n", + " Description: A diagram illustrates a data processing flow with icons representing volumes, tables, features, models, inference tables, and metric tables, connected by arrows.\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " CAPTIO#82\n", + "
\n", + " \n", + "
\n", + "
\n", + " CAPTION #82\n", + "
\n", + "
\n", + " Assets of an ML workflow, all managed via Unity Catalog\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + " FIGURE#83\n", + "
\n", + " \n", + "
\n", + "
\n", + " FIGURE #83\n", + "
\n", + "
\n", + " Description: The logo features the word "databricks" in bold, dark blue text accompanied by a stylized red and white graphic to the left.\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "
\n", + "

\uD83D\uDCCB Page 10 Elements (11 items)

\n", + " \n", + "
\n", + "
\n", + "

\n", + " PAGE HEADER (ID: 73)\n", + "

\n", + " \n", + " [59, 61, 345, 77]\n", + " \n", + "
\n", + "
\n", + " BIG BOOK OF MLOPS - 2ND EDITION\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " PAGE HEADER (ID: 74)\n", + "

\n", + " \n", + " [1624, 61, 1647, 77]\n", + " \n", + "
\n", + "
\n", + " 10\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " SECTION HEADER (ID: 75)\n", + "

\n", + " \n", + " [59, 210, 307, 283]\n", + " \n", + "
\n", + "
\n", + " CHAPTER 3 What's New?\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " TEXT (ID: 76)\n", + "

\n", + " \n", + " [607, 257, 1567, 343]\n", + " \n", + "
\n", + "
\n", + " In this section we outline the key features and product updates introduced into our updated MLOps reference architecture. For each of these, we highlight the benefits they bring and how they impact our end-to-end MLOps workflow.\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " SECTION HEADER (ID: 77)\n", + "

\n", + " \n", + " [607, 384, 805, 417]\n", + " \n", + "
\n", + "
\n", + " Unity Catalog\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " TEXT (ID: 78)\n", + "

\n", + " \n", + " [607, 433, 1569, 550]\n", + " \n", + "
\n", + "
\n", + " The Lakehouse forms the foundation of a data-centric AI platform. Key to this is the ability to manage both data and AI assets from a unified governance solution on the Lakehouse. Databricks Unity Catalog enables this by providing centralized access control, auditing, lineage, and data discovery capabilities across Databricks workspaces.\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " TEXT (ID: 79)\n", + "

\n", + " \n", + " [607, 574, 1554, 691]\n", + " \n", + "
\n", + "
\n", + " These benefits are now extended to MLflow models with the introduction of Models in Unity Catalog. By providing a hosted version of the MLflow Model Registry in Unity Catalog, the full lifecycle of an ML model can be managed while leveraging Unity Catalog's capability to share assets across Databricks workspaces and trace lineage across both data and models.\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " TEXT (ID: 80)\n", + "

\n", + " \n", + " [607, 716, 1567, 864]\n", + " \n", + "
\n", + "
\n", + " In addition to managing ML models, feature tables are also a part of Unity Catalog. With Feature Engineering in Unity Catalog, any Delta table in Unity Catalog that has been assigned a primary key (and additionally a timestamp key) can be used as a source of features to train and serve models. Furthermore, feature tables can now also be shared across different workspaces, and lineage recorded between other assets in the Lakehouse.\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " FIGURE (ID: 81)\n", + "

\n", + " \n", + " [605, 893, 1603, 1139]\n", + " \n", + "
\n", + "
\n", + " Description: A diagram illustrates a data processing flow with icons representing volumes, tables, features, models, inference tables, and metric tables, connected by arrows.\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " CAPTION (ID: 82)\n", + "

\n", + " \n", + " [607, 1157, 992, 1174]\n", + " \n", + "
\n", + "
\n", + " Assets of an ML workflow, all managed via Unity Catalog\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "

\n", + " FIGURE (ID: 83)\n", + "

\n", + " \n", + " [53, 1180, 324, 1229]\n", + " \n", + "
\n", + "
\n", + " Description: The logo features the word \"databricks\" in bold, dark blue text accompanied by a stylized red and white graphic to the left.\n", + "
\n", + "
\n", + " \n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "for parsed_result in parsed_results:\n", + " render_ai_parse_output(parsed_result, page_selection)" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "computePreferences": { + "hardware": { + "accelerator": null, + "gpuPoolId": null, + "memory": null + } + }, + "dashboards": [], + "environmentMetadata": { + "base_environment": "", + "environment_version": "3" + }, + "inputWidgetPreferences": null, + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 2 + }, + "notebookName": "ai_parse_document -- debug output", + "widgets": { + "image_output_path": { + "currentValue": "/Volumes/users/jas_bali/pdfs_for_bricks/", + "nuid": "b024b356-873a-4d79-9498-c33aa08ab48a", + "typedWidgetInfo": { + "autoCreated": false, + "defaultValue": "/Volumes/main/default/parsed_output/", + "label": null, + "name": "image_output_path", + "options": { + "widgetDisplayType": "Text", + "validationRegex": null + }, + "parameterDataType": "String" + }, + "widgetInfo": { + "widgetType": "text", + "defaultValue": "/Volumes/main/default/parsed_output/", + "label": null, + "name": "image_output_path", + "options": { + "widgetType": "text", + "autoCreated": null, + "validationRegex": null + } + } + }, + "input_file": { + "currentValue": "/Volumes/users/jas_bali/pdfs_ie/2023-10-EB-Big-Book-of-MLOps-2nd-Edition.pdf", + "nuid": "c8c4fa93-30c4-4462-b585-256a6b1d60d2", + "typedWidgetInfo": { + "autoCreated": false, + "defaultValue": "/Volumes/main/default/source_documents/sample.pdf", + "label": null, + "name": "input_file", + "options": { + "widgetDisplayType": "Text", + "validationRegex": null + }, + "parameterDataType": "String" + }, + "widgetInfo": { + "widgetType": "text", + "defaultValue": "/Volumes/main/default/source_documents/sample.pdf", + "label": null, + "name": "input_file", + "options": { + "widgetType": "text", + 
"autoCreated": null, + "validationRegex": null + } + } + }, + "page_selection": { + "currentValue": "1-10", + "nuid": "cd86904e-36de-4980-a660-304943552e8d", + "typedWidgetInfo": { + "autoCreated": false, + "defaultValue": "all", + "label": null, + "name": "page_selection", + "options": { + "widgetDisplayType": "Text", + "validationRegex": null + }, + "parameterDataType": "String" + }, + "widgetInfo": { + "widgetType": "text", + "defaultValue": "all", + "label": null, + "name": "page_selection", + "options": { + "widgetType": "text", + "autoCreated": null, + "validationRegex": null + } + } + } + } + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/knowledge_base/workflow_with_ai_parse_document/src/explorations/ai_parse_document -- debug output.py b/knowledge_base/workflow_with_ai_parse_document/src/explorations/ai_parse_document -- debug output.py deleted file mode 100644 index 2f39afab..00000000 --- a/knowledge_base/workflow_with_ai_parse_document/src/explorations/ai_parse_document -- debug output.py +++ /dev/null @@ -1,782 +0,0 @@ -# Databricks notebook source -# MAGIC %md -# MAGIC # 🔍 AI Parse Document Debug Interface -# MAGIC -# MAGIC Version 1.3 -# MAGIC -# MAGIC Last update: Oct 6, 2025 -# MAGIC -# MAGIC Changelog: -# MAGIC - Simplified widget parameters: `input_file` and `image_output_path` now accept full volume paths -# MAGIC - Removed separate `catalog`, `schema`, `volume` widgets -# MAGIC - `input_file` supports wildcards for processing multiple files (e.g., `/Volumes/catalog/schema/volume/input/*`) -# MAGIC -# MAGIC ## Overview -# MAGIC This notebook provides a **visual debugging interface** for analyzing the output of Databricks' `ai_parse_document` function. It renders parsed documents with interactive bounding box overlays, allowing you to inspect what content was extracted from each region of your documents. 
-# MAGIC -# MAGIC ## Features -# MAGIC - 📊 **Visual Bounding Boxes**: Color-coded overlays showing the exact regions where text/elements were detected -# MAGIC - 🎯 **Interactive Tooltips**: Hover over any bounding box to see the parsed content from that region -# MAGIC - 📐 **Automatic Scaling**: Large documents are automatically scaled to fit within 1024px width for optimal viewing -# MAGIC - 🎨 **Element Type Visualization**: Different colors for different element types (text, headers, tables, figures, etc.) -# MAGIC -# MAGIC ## Required Parameters -# MAGIC -# MAGIC This interface requires widget parameters to be configured before running: -# MAGIC -# MAGIC ### 1. `input_file` -# MAGIC - **Description**: Full Unity Catalog volume path to the document(s) you want to parse and visualize -# MAGIC - **Examples**: -# MAGIC - Single file: `/Volumes/catalog/schema/volume/input/document.pdf` -# MAGIC - All files in directory: `/Volumes/catalog/schema/volume/input/*` -# MAGIC - Pattern matching: `/Volumes/catalog/schema/volume/input/*.pdf` -# MAGIC - **Requirements**: Read access to the volume containing your PDF/image files -# MAGIC -# MAGIC ### 2. `image_output_path` -# MAGIC - **Description**: Full Unity Catalog volume path where `ai_parse_document` will store the extracted page images -# MAGIC - **Example**: `/Volumes/catalog/schema/volume/output/` -# MAGIC - **Requirements**: Write access required for storing intermediate image outputs -# MAGIC - **Note**: As documented in the [official Databricks documentation](https://docs.databricks.com/aws/en/sql/language-manual/functions/ai_parse_document), this path is used by the parsing function to store page images that are referenced in the output -# MAGIC -# MAGIC ### 3. 
`page_selection` -# MAGIC - **Description**: Specifies which pages to display in the visualization -# MAGIC - **Supported formats**: -# MAGIC - `"all"` or leave empty: Display all pages -# MAGIC - `"3"`: Display only page 3 (1-indexed) -# MAGIC - `"1-5"`: Display pages 1 through 5 (inclusive, 1-indexed) -# MAGIC - `"1,3,5"`: Display specific pages (1-indexed) -# MAGIC - `"1-3,7,10-12"`: Mixed ranges and individual pages -# MAGIC -# MAGIC ## Usage Instructions -# MAGIC -# MAGIC 1. **Clone this notebook** to your workspace: -# MAGIC - Select **"File -> Clone"** button in the top toolbar -# MAGIC - Choose your desired location in your workspace -# MAGIC - This ensures you have a personal copy you can modify and run -# MAGIC -# MAGIC 2. **Prepare your Unity Catalog volumes**: -# MAGIC - Create or identify a volume for your PDF/image files -# MAGIC - Create or identify a volume for output images -# MAGIC - Upload your PDF files to the input location -# MAGIC -# MAGIC 3. **Configure the widget parameters** at the top of this notebook: -# MAGIC - Set `input_file` to the full volume path (file or directory with wildcard) -# MAGIC - Set `image_output_path` to the full volume path for outputs -# MAGIC - Set `page_selection` to control which pages to visualize -# MAGIC -# MAGIC 4. **Run all code cells** which will generate visual debugging results. 
-# MAGIC -# MAGIC ## What You'll See -# MAGIC -# MAGIC - **Document Summary**: Overview of pages, element counts, and document metadata -# MAGIC - **Color Legend**: Visual guide showing which colors represent which element types -# MAGIC - **Annotated Images**: Each page with overlaid bounding boxes -# MAGIC - Hover over any box to see the extracted content -# MAGIC - Yellow highlight indicates the currently hovered element -# MAGIC - **Parsed Elements List**: Complete list of all extracted elements with their content - -# COMMAND ---------- - -# Exec Parameters - -dbutils.widgets.text("input_file", "/Volumes/main/default/source_documents/sample.pdf") -dbutils.widgets.text("image_output_path", "/Volumes/main/default/parsed_output/") -dbutils.widgets.text("page_selection", "all") - -input_file = dbutils.widgets.get("input_file") -image_output_path = dbutils.widgets.get("image_output_path") -page_selection = dbutils.widgets.get("page_selection") - -# COMMAND ---------- - -# DBTITLE 1,Configuration Parameters -# Path configuration - use widget values as-is - -source_files = input_file - -# Parse page selection string and return list of page indices to display. 
-# -# Supported formats: -# - "all" or None: Display all pages -# - "3": Display specific page (1-indexed) -# - "1-5": Display page range (inclusive, 1-indexed) -# - "1,3,5": Display list of specific pages (1-indexed) -# - "1-3,7,10-12": Mixed ranges and individual pages -page_selection = f"{page_selection}" - -# COMMAND ---------- - -# DBTITLE 1,Run Document Parse Code (may take some time) -# SQL statement with ai_parse_document() -# Note: input_file can be a single file path or a directory path with wildcard -sql = f''' -with parsed_documents AS ( - SELECT - path, - ai_parse_document(content - , - map( - 'version', '2.0', - 'imageOutputPath', '{image_output_path}', - 'descriptionElementTypes', '*' - ) - ) as parsed - FROM - read_files('{source_files}', format => 'binaryFile') -) -select * from parsed_documents -''' - -parsed_results = [row.parsed for row in spark.sql(sql).collect()] - -# COMMAND ---------- - -import json -from typing import Dict, List, Any, Optional, Tuple, Set, Union -from IPython.display import HTML, display -import base64 -import os -from PIL import Image -import io - -class DocumentRenderer: - def __init__(self): - # Color mapping for different element types - self.element_colors = { - 'section_header': '#FF6B6B', - 'text': '#4ECDC4', - 'figure': '#45B7D1', - 'caption': '#96CEB4', - 'page_footer': '#FFEAA7', - 'page_header': '#DDA0DD', - 'table': '#98D8C8', - 'list': '#F7DC6F', - 'default': '#BDC3C7' - } - - def _parse_page_selection(self, page_selection: Union[str, None], total_pages: int) -> Set[int]: - """Parse page selection string and return set of page indices (0-based). 
- - Args: - page_selection: Selection string or None - total_pages: Total number of pages available - - Returns: - Set of 0-based page indices to display - """ - # Handle None or "all" - return all pages - if page_selection is None or page_selection.lower() == "all": - return set(range(total_pages)) - - selected_pages = set() - - # Clean the input - page_selection = page_selection.strip() - - # Split by commas for multiple selections - parts = page_selection.split(',') - - for part in parts: - part = part.strip() - - # Check if it's a range (contains hyphen) - if '-' in part: - try: - # Split range and convert to integers - range_parts = part.split('-') - if len(range_parts) == 2: - start = int(range_parts[0].strip()) - end = int(range_parts[1].strip()) - - # Convert from 1-indexed to 0-indexed - start_idx = start - 1 - end_idx = end - 1 - - # Add all pages in range (inclusive) - for i in range(start_idx, end_idx + 1): - if 0 <= i < total_pages: - selected_pages.add(i) - except ValueError: - print(f"Warning: Invalid range '{part}' in page selection") - else: - # Single page number - try: - page_num = int(part.strip()) - # Convert from 1-indexed to 0-indexed - page_idx = page_num - 1 - if 0 <= page_idx < total_pages: - selected_pages.add(page_idx) - else: - print(f"Warning: Page {page_num} is out of range (1-{total_pages})") - except ValueError: - print(f"Warning: Invalid page number '{part}' in page selection") - - # If no valid pages were selected, default to all pages - if not selected_pages: - print(f"Warning: No valid pages in selection '{page_selection}'. 
Showing all pages.") - return set(range(total_pages)) - - return selected_pages - - def _get_element_color(self, element_type: str) -> str: - """Get color for element type.""" - return self.element_colors.get(element_type.lower(), self.element_colors['default']) - - def _get_image_dimensions(self, image_path: str) -> Optional[Tuple[int, int]]: - """Get dimensions of an image file.""" - try: - if os.path.exists(image_path): - with Image.open(image_path) as img: - return img.size # Returns (width, height) - return None - except Exception as e: - print(f"Error getting image dimensions for {image_path}: {e}") - return None - - def _load_image_as_base64(self, image_path: str) -> Optional[str]: - """Load image from file path and convert to base64.""" - try: - if os.path.exists(image_path): - with open(image_path, 'rb') as img_file: - img_data = img_file.read() - img_base64 = base64.b64encode(img_data).decode('utf-8') - ext = os.path.splitext(image_path)[1].lower() - if ext in ['.jpg', '.jpeg']: - return f"data:image/jpeg;base64,{img_base64}" - elif ext in ['.png']: - return f"data:image/png;base64,{img_base64}" - else: - return f"data:image/jpeg;base64,{img_base64}" - return None - except Exception as e: - print(f"Error loading image {image_path}: {e}") - return None - - def _render_element_content(self, element: Dict, for_tooltip: bool = False) -> str: - """Render element content with appropriate formatting for both tooltip and element list display. 
- - Args: - element: The element dictionary containing content/description - for_tooltip: Whether this is for tooltip display (affects styling and truncation) - """ - element_type = element.get('type', 'unknown') - content = element.get('content', '') - description = element.get('description', '') - - display_content = "" - - if content: - if element_type == 'table': - # Render the HTML table with styling - table_html = content - - # Apply different styling based on context - if for_tooltip: - # Compact styling for tooltips with light theme - # Use full width available for tooltip tables - table_style = f'''style="width: 100%; border-collapse: collapse; margin: 5px 0; font-size: 10px;"''' - th_style = 'style="border: 1px solid #ddd; padding: 4px; background: #f8f9fa; color: #333; font-weight: bold; text-align: left; font-size: 10px;"' - td_style = 'style="border: 1px solid #ddd; padding: 4px; color: #333; font-size: 10px;"' - thead_style = 'style="background: #e9ecef;"' - else: - # Full styling for element list - table_style = '''style="width: 100%; border-collapse: collapse; margin: 10px 0; font-size: 13px;"''' - th_style = 'style="border: 1px solid #ddd; padding: 8px; background: #f5f5f5; font-weight: bold; text-align: left;"' - td_style = 'style="border: 1px solid #ddd; padding: 8px;"' - thead_style = 'style="background: #f0f0f0;"' - - # Apply styling transformations - if '' in table_html: - table_html = table_html.replace('
', f'
') - if '' in table_html: - table_html = table_html.replace('', f'') - - if for_tooltip: - display_content = table_html - else: - display_content = f"
{table_html}
" - else: - # Regular content handling - if for_tooltip and len(content) > 500: - # Truncate for tooltip display and escape HTML for safety - display_content = self._escape_for_html_attribute(content[:500] + "...") - else: - display_content = self._escape_for_html_attribute(content) if for_tooltip else content - elif description: - desc_content = description - if for_tooltip and len(desc_content) > 500: - desc_content = desc_content[:500] + "..." - - if for_tooltip: - display_content = self._escape_for_html_attribute(f"Description: {desc_content}") - else: - display_content = f"Description: {desc_content}" - else: - display_content = "No content available" if for_tooltip else "No content" - - return display_content - - def _escape_for_html_attribute(self, text: str) -> str: - """Escape text for safe use in HTML attributes.""" - return (text.replace('&', '&') - .replace('<', '<') - .replace('>', '>') - .replace('"', '"') - .replace("'", ''') - .replace('\n', '
')) - - def _calculate_tooltip_width(self, element: Dict, image_width: int) -> int: - """Calculate dynamic tooltip width based on table content.""" - element_type = element.get('type', 'unknown') - content = element.get('content', '') - - if element_type == 'table' and content: - # Count columns by looking for ', content, re.DOTALL | re.IGNORECASE) - if first_row_match: - first_row = first_row_match.group(1) - # Count th or td tags - th_count = len(re.findall(r']*>', first_row, re.IGNORECASE)) - td_count = len(re.findall(r']*>', first_row, re.IGNORECASE)) - column_count = max(th_count, td_count) - - if column_count > 0: - # Base width + additional width per column - base_width = 300 - width_per_column = 80 - calculated_width = base_width + (column_count * width_per_column) - - # Cap at 4/5th of image width - max_width = int(image_width * 0.8) - return min(calculated_width, max_width) - - # Default width for non-tables or when calculation fails - return 400 - - def _create_annotated_image(self, page: Dict, elements: List[Dict]) -> str: - """Create annotated image with SCALING to fit within 1024px width.""" - image_uri = page.get('image_uri', '') - page_id = page.get('id', 0) - - if not image_uri: - return "

No image URI found for this page

" - - # Load image - img_data_uri = self._load_image_as_base64(image_uri) - if not img_data_uri: - return f""" -
- Could not load image: {image_uri}
- Make sure the file exists and is accessible. -
- """ - - # Get original image dimensions - original_dimensions = self._get_image_dimensions(image_uri) - if not original_dimensions: - # Fallback: display without explicit scaling - original_width, original_height = 1024, 768 # Default fallback - else: - original_width, original_height = original_dimensions - - # Calculate scaling factor to fit within 1024px width - max_display_width = 1024 - scale_factor = 1.0 - display_width = original_width - display_height = original_height - - if original_width > max_display_width: - scale_factor = max_display_width / original_width - display_width = max_display_width - display_height = int(original_height * scale_factor) - - # Filter elements for this page and collect their bounding boxes - page_elements = [] - - for elem in elements: - elem_bboxes = [] - for bbox in elem.get('bbox', []): - if bbox.get('page_id', 0) == page_id: - coord = bbox.get('coord', []) - if len(coord) >= 4: - elem_bboxes.append(bbox) - - if elem_bboxes: - page_elements.append({ - 'element': elem, - 'bboxes': elem_bboxes - }) - - if not page_elements: - return f"

No elements found for page {page_id}

" - - header_info = f""" -
- Page {page_id + 1}: {len(page_elements)} elements
- Original size: {original_width}×{original_height}px | - Display size: {display_width}×{display_height}px | - Scale factor: {scale_factor:.3f}
-
- """ - - # Generate unique container ID for this page - container_id = f"page_container_{page_id}_{id(self)}" - - # Create bounding box overlays using SCALED coordinates with hover functionality - overlays = [] - - for idx, item in enumerate(page_elements): - element = item['element'] - element_id = element.get('id', 'N/A') - element_type = element.get('type', 'unknown') - color = self._get_element_color(element_type) - - # Use the shared content renderer for tooltip - tooltip_content = self._render_element_content(element, for_tooltip=True) - - # Calculate dynamic tooltip width - tooltip_width = self._calculate_tooltip_width(element, display_width) - - # Tables should render as HTML, other content should be escaped - - for bbox_idx, bbox in enumerate(item['bboxes']): - coord = bbox.get('coord', []) - if len(coord) >= 4: - x1, y1, x2, y2 = coord - - # Apply scaling to coordinates - scaled_x1 = x1 * scale_factor - scaled_y1 = y1 * scale_factor - scaled_x2 = x2 * scale_factor - scaled_y2 = y2 * scale_factor - - width = scaled_x2 - scaled_x1 - height = scaled_y2 - scaled_y1 - - # Skip invalid boxes - if width <= 0 or height <= 0: - continue - - # Position label above box when possible - label_top = -18 if scaled_y1 >= 18 else 2 - - # Unique ID for this bounding box - box_id = f"bbox_{page_id}_{idx}_{bbox_idx}" - - # Calculate tooltip position (prefer right side, but switch to left if needed) - tooltip_left = 10 - - overlay = f""" -
-
- {element_type.upper()[:6]}#{element_id} -
- -
-
- {element_type.upper()} #{element_id} -
-
- {tooltip_content} -
-
-
- """ - overlays.append(overlay) - - # Pure CSS hover functionality (works in Databricks) - styles = f""" - - """ - - return f""" - {header_info} - {styles} -
- Page {page_id + 1} - {''.join(overlays)} -
- """ - - def _create_page_elements_list(self, page_id: int, elements: List[Dict]) -> str: - """Create a detailed list of elements for a specific page.""" - # Filter elements for this page - page_elements = [] - - for elem in elements: - elem_bboxes = [] - for bbox in elem.get('bbox', []): - if bbox.get('page_id', 0) == page_id: - elem_bboxes.append(bbox) - - if elem_bboxes: - page_elements.append(elem) - - if not page_elements: - return f"

No elements found for page {page_id + 1}

" - - html_parts = [] - - for element in page_elements: - element_id = element.get('id', 'N/A') - element_type = element.get('type', 'unknown') - color = self._get_element_color(element_type) - - # Get bounding box info for this page only - bbox_info = "No bbox" - bbox_list = element.get('bbox', []) - if bbox_list: - bbox_details = [] - for bbox in bbox_list: - if bbox.get('page_id', 0) == page_id: - coord = bbox.get('coord', []) - if len(coord) >= 4: - bbox_details.append(f"[{coord[0]:.0f}, {coord[1]:.0f}, {coord[2]:.0f}, {coord[3]:.0f}]") - bbox_info = "; ".join(bbox_details) if bbox_details else "Invalid bbox" - - # Use the shared content renderer for element list display - display_content = self._render_element_content(element, for_tooltip=False) - - element_html = f""" -
-
-

- {element_type.upper().replace('_', ' ')} (ID: {element_id}) -

- - {bbox_info} - -
-
- {display_content} -
-
- """ - html_parts.append(element_html) - - return f""" -
-

📋 Page {page_id + 1} Elements ({len(page_elements)} items)

- {''.join(html_parts)} -
- """ - - def _create_summary(self, document: Dict, metadata: Dict, selected_pages: Set[int], total_pages: int) -> str: - """Create a summary with page selection info.""" - elements = document.get('elements', []) - - # Count elements only on selected pages - selected_elements = [] - for elem in elements: - for bbox in elem.get('bbox', []): - if bbox.get('page_id', 0) in selected_pages: - selected_elements.append(elem) - break - - # Count by type (for selected pages) - type_counts = {} - for elem in selected_elements: - elem_type = elem.get('type', 'unknown') - type_counts[elem_type] = type_counts.get(elem_type, 0) + 1 - - type_list = ', '.join([f"{t}: {c}" for t, c in type_counts.items()]) - - # Create page selection info - if len(selected_pages) == total_pages: - page_info = f"All {total_pages} pages" - else: - # Convert to 1-indexed for display - page_nums = sorted([p + 1 for p in selected_pages]) - if len(page_nums) <= 10: - page_info = f"Pages {', '.join(map(str, page_nums))} ({len(selected_pages)} of {total_pages})" - else: - page_info = f"{len(selected_pages)} of {total_pages} pages selected" - - return f""" -
-

📄 Document Summary

-

Displaying: {page_info}

-

Elements on selected pages: {len(selected_elements)}

-

Element Types: {type_list if type_list else 'None'}

-

Document ID: {str(metadata.get('id', 'N/A'))[:12]}...

-
- """ - - def render_document(self, parsed_result: Any, page_selection: Union[str, None] = None) -> None: - """Main render function with page selection support. - - Args: - parsed_result: The parsed document result - page_selection: Page selection string. Supported formats: - - "all" or None: Display all pages - - "3": Display only page 3 (1-indexed) - - "1-5": Display pages 1 through 5 (inclusive) - - "1,3,5": Display specific pages - - "1-3,7,10-12": Mixed format - """ - try: - # Convert to dict - if hasattr(parsed_result, 'toPython'): - parsed_dict = parsed_result.toPython() - elif hasattr(parsed_result, 'toJson'): - parsed_dict = json.loads(parsed_result.toJson()) - elif isinstance(parsed_result, dict): - parsed_dict = parsed_result - else: - display(HTML(f"

❌ Could not convert result. Type: {type(parsed_result)}

")) - return - - # Extract components - document = parsed_dict.get('document', {}) - pages = document.get('pages', []) - elements = document.get('elements', []) - metadata = parsed_dict.get('metadata', {}) - - if not elements: - display(HTML("

❌ No elements found in document

")) - return - - # Parse page selection - selected_pages = self._parse_page_selection(page_selection, len(pages)) - - # Display title - display(HTML("

🔍 AI Parse Document Results

")) - - # Display summary with page selection info - summary_html = self._create_summary(document, metadata, selected_pages, len(pages)) - display(HTML(summary_html)) - - # Display color legend - legend_items = [] - for elem_type, color in self.element_colors.items(): - if elem_type != 'default': - legend_items.append(f""" - - - {elem_type.replace('_', ' ').title()} - - """) - - display(HTML(f""" -
- 🎨 Element Colors:
- {''.join(legend_items)} -
- """)) - - # Display annotated images with their corresponding elements (filtered by selection) - if pages: - display(HTML("

🖼️ Annotated Images & Elements

")) - - # Sort selected pages for display - sorted_selected = sorted(selected_pages) - - for page_idx in sorted_selected: - if page_idx < len(pages): - page = pages[page_idx] - - # Display the annotated image - annotated_html = self._create_annotated_image(page, elements) - display(HTML(f"
{annotated_html}
")) - - # Display elements for this page immediately after the image - page_id = page.get('id', page_idx) - page_elements_html = self._create_page_elements_list(page_id, elements) - display(HTML(page_elements_html)) - - except Exception as e: - display(HTML(f"

❌ Error: {str(e)}

")) - import traceback - display(HTML(f"
{traceback.format_exc()}
")) - - -# Simple usage functions -def render_ai_parse_output(parsed_result, page_selection=None): - """Simple function to render ai_parse_document output with page selection. - - Args: - parsed_result: The parsed document result - page_selection: Optional page selection string. Examples: - - None or "all": Display all pages - - "3": Display only page 3 - - "1-5": Display pages 1 through 5 - - "1,3,5": Display specific pages - - "1-3,7,10-12": Mixed format - """ - renderer = DocumentRenderer() - renderer.render_document(parsed_result, page_selection) - -# COMMAND ---------- - -# DBTITLE 1,Debug Visualization Results -for parsed_result in parsed_results: - render_ai_parse_output(parsed_result, page_selection) \ No newline at end of file From 9d616464b3fa005a5690ca890340e8297d7d1f7d Mon Sep 17 00:00:00 2001 From: "jas.bali" Date: Mon, 13 Oct 2025 19:31:06 -0400 Subject: [PATCH 3/6] Replace ipynb with py and html formats for better output visualization The ipynb format was not preserving the visual output properly. Using .py notebook source and .html export to show outputs correctly. 
--- .../ai_parse_document -- debug output.html | 45 + .../ai_parse_document -- debug output.ipynb | 7486 ----------------- .../ai_parse_document -- debug output.py | 782 ++ 3 files changed, 827 insertions(+), 7486 deletions(-) create mode 100644 knowledge_base/workflow_with_ai_parse_document/src/explorations/ai_parse_document -- debug output.html delete mode 100644 knowledge_base/workflow_with_ai_parse_document/src/explorations/ai_parse_document -- debug output.ipynb create mode 100644 knowledge_base/workflow_with_ai_parse_document/src/explorations/ai_parse_document -- debug output.py diff --git a/knowledge_base/workflow_with_ai_parse_document/src/explorations/ai_parse_document -- debug output.html b/knowledge_base/workflow_with_ai_parse_document/src/explorations/ai_parse_document -- debug output.html new file mode 100644 index 00000000..ebe618d8 --- /dev/null +++ b/knowledge_base/workflow_with_ai_parse_document/src/explorations/ai_parse_document -- debug output.html @@ -0,0 +1,45 @@ + + + + +ai_parse_document -- debug output - Databricks + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/knowledge_base/workflow_with_ai_parse_document/src/explorations/ai_parse_document -- debug output.ipynb b/knowledge_base/workflow_with_ai_parse_document/src/explorations/ai_parse_document -- debug output.ipynb deleted file mode 100644 index be7155ee..00000000 --- a/knowledge_base/workflow_with_ai_parse_document/src/explorations/ai_parse_document -- debug output.ipynb +++ /dev/null @@ -1,7486 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "b1fe924b-46e4-4cd2-ac59-712283f9af38", - "showTitle": false, - "tableResultSettingsMap": {}, - "title": "" - } - }, - "source": [ - "# \uD83D\uDD0D AI Parse Document Debug Interface\n", - "\n", - "Version 1.3\n", - "\n", - "Last update: Oct 6, 2025\n", - 
"\n", - "Changelog:\n", - "- Simplified widget parameters: `input_file` and `image_output_path` now accept full volume paths\n", - "- Removed separate `catalog`, `schema`, `volume` widgets\n", - "- `input_file` supports wildcards for processing multiple files (e.g., `/Volumes/catalog/schema/volume/input/*`)\n", - "\n", - "## Overview\n", - "This notebook provides a **visual debugging interface** for analyzing the output of Databricks' `ai_parse_document` function. It renders parsed documents with interactive bounding box overlays, allowing you to inspect what content was extracted from each region of your documents.\n", - "\n", - "## Features\n", - "- \uD83D\uDCCA **Visual Bounding Boxes**: Color-coded overlays showing the exact regions where text/elements were detected\n", - "- \uD83C\uDFAF **Interactive Tooltips**: Hover over any bounding box to see the parsed content from that region\n", - "- \uD83D\uDCD0 **Automatic Scaling**: Large documents are automatically scaled to fit within 1024px width for optimal viewing\n", - "- \uD83C\uDFA8 **Element Type Visualization**: Different colors for different element types (text, headers, tables, figures, etc.)\n", - "\n", - "## Required Parameters\n", - "\n", - "This interface requires widget parameters to be configured before running:\n", - "\n", - "### 1. `input_file`\n", - "- **Description**: Full Unity Catalog volume path to the document(s) you want to parse and visualize\n", - "- **Examples**:\n", - " - Single file: `/Volumes/catalog/schema/volume/input/document.pdf`\n", - " - All files in directory: `/Volumes/catalog/schema/volume/input/*`\n", - " - Pattern matching: `/Volumes/catalog/schema/volume/input/*.pdf`\n", - "- **Requirements**: Read access to the volume containing your PDF/image files\n", - "\n", - "### 2. 
`image_output_path`\n", - "- **Description**: Full Unity Catalog volume path where `ai_parse_document` will store the extracted page images\n", - "- **Example**: `/Volumes/catalog/schema/volume/output/`\n", - "- **Requirements**: Write access required for storing intermediate image outputs\n", - "- **Note**: As documented in the [official Databricks documentation](https://docs.databricks.com/aws/en/sql/language-manual/functions/ai_parse_document), this path is used by the parsing function to store page images that are referenced in the output\n", - "\n", - "### 3. `page_selection`\n", - "- **Description**: Specifies which pages to display in the visualization\n", - "- **Supported formats**:\n", - " - `\"all\"` or leave empty: Display all pages\n", - " - `\"3\"`: Display only page 3 (1-indexed)\n", - " - `\"1-5\"`: Display pages 1 through 5 (inclusive, 1-indexed)\n", - " - `\"1,3,5\"`: Display specific pages (1-indexed)\n", - " - `\"1-3,7,10-12\"`: Mixed ranges and individual pages\n", - "\n", - "## Usage Instructions\n", - "\n", - "1. **Clone this notebook** to your workspace:\n", - " - Select **\"File -> Clone\"** button in the top toolbar\n", - " - Choose your desired location in your workspace\n", - " - This ensures you have a personal copy you can modify and run\n", - "\n", - "2. **Prepare your Unity Catalog volumes**:\n", - " - Create or identify a volume for your PDF/image files\n", - " - Create or identify a volume for output images\n", - " - Upload your PDF files to the input location\n", - "\n", - "3. **Configure the widget parameters** at the top of this notebook:\n", - " - Set `input_file` to the full volume path (file or directory with wildcard)\n", - " - Set `image_output_path` to the full volume path for outputs\n", - " - Set `page_selection` to control which pages to visualize\n", - "\n", - "4. 
**Run all code cells** which will generate visual debugging results.\n", - "\n", - "## What You'll See\n", - "\n", - "- **Document Summary**: Overview of pages, element counts, and document metadata\n", - "- **Color Legend**: Visual guide showing which colors represent which element types\n", - "- **Annotated Images**: Each page with overlaid bounding boxes\n", - " - Hover over any box to see the extracted content\n", - " - Yellow highlight indicates the currently hovered element\n", - "- **Parsed Elements List**: Complete list of all extracted elements with their content" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "2763f01c-f6f2-47b4-9a0a-bfeccfd131ca", - "showTitle": false, - "tableResultSettingsMap": {}, - "title": "" - } - }, - "outputs": [], - "source": [ - "# Exec Parameters\n", - "\n", - "dbutils.widgets.text(\"input_file\", \"/Volumes/main/default/source_documents/sample.pdf\")\n", - "dbutils.widgets.text(\"image_output_path\", \"/Volumes/main/default/parsed_output/\")\n", - "dbutils.widgets.text(\"page_selection\", \"all\")\n", - "\n", - "input_file = dbutils.widgets.get(\"input_file\")\n", - "image_output_path = dbutils.widgets.get(\"image_output_path\")\n", - "page_selection = dbutils.widgets.get(\"page_selection\")" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "cbdcca83-af8c-427b-b82a-33d5cd73b998", - "showTitle": true, - "tableResultSettingsMap": {}, - "title": "Configuration Parameters" - } - }, - "outputs": [], - "source": [ - "# Path configuration - use widget values as-is\n", - "\n", - "source_files = input_file\n", - "\n", - "# Parse page selection string and return list of page indices to display.\n", - 
"#\n", - "# Supported formats:\n", - "# - \"all\" or None: Display all pages\n", - "# - \"3\": Display specific page (1-indexed)\n", - "# - \"1-5\": Display page range (inclusive, 1-indexed)\n", - "# - \"1,3,5\": Display list of specific pages (1-indexed)\n", - "# - \"1-3,7,10-12\": Mixed ranges and individual pages\n", - "page_selection = f\"{page_selection}\"" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "8eb28801-7c66-4d5f-9ee6-66d8e10bd45e", - "showTitle": true, - "tableResultSettingsMap": {}, - "title": "Run Document Parse Code (may take some time)" - } - }, - "outputs": [], - "source": [ - "# SQL statement with ai_parse_document()\n", - "# Note: input_file can be a single file path or a directory path with wildcard\n", - "sql = f'''\n", - "with parsed_documents AS (\n", - " SELECT\n", - " path,\n", - " ai_parse_document(content\n", - " ,\n", - " map(\n", - " 'version', '2.0',\n", - " 'imageOutputPath', '{image_output_path}',\n", - " 'descriptionElementTypes', '*'\n", - " )\n", - " ) as parsed\n", - " FROM\n", - " read_files('{source_files}', format => 'binaryFile')\n", - ")\n", - "select * from parsed_documents\n", - "'''\n", - "\n", - "parsed_results = [row.parsed for row in spark.sql(sql).collect()]" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "3770f490-a617-46f9-9904-e85249dc0f33", - "showTitle": false, - "tableResultSettingsMap": {}, - "title": "" - } - }, - "outputs": [], - "source": [ - "import json\n", - "from typing import Dict, List, Any, Optional, Tuple, Set, Union\n", - "from IPython.display import HTML, display\n", - "import base64\n", - "import os\n", - "from PIL import Image\n", - "import io\n", - "\n", 
- "class DocumentRenderer:\n", - " def __init__(self):\n", - " # Color mapping for different element types\n", - " self.element_colors = {\n", - " 'section_header': '#FF6B6B',\n", - " 'text': '#4ECDC4', \n", - " 'figure': '#45B7D1',\n", - " 'caption': '#96CEB4',\n", - " 'page_footer': '#FFEAA7',\n", - " 'page_header': '#DDA0DD',\n", - " 'table': '#98D8C8',\n", - " 'list': '#F7DC6F',\n", - " 'default': '#BDC3C7'\n", - " }\n", - " \n", - " def _parse_page_selection(self, page_selection: Union[str, None], total_pages: int) -> Set[int]:\n", - " \"\"\"Parse page selection string and return set of page indices (0-based).\n", - " \n", - " Args:\n", - " page_selection: Selection string or None\n", - " total_pages: Total number of pages available\n", - " \n", - " Returns:\n", - " Set of 0-based page indices to display\n", - " \"\"\"\n", - " # Handle None or \"all\" - return all pages\n", - " if page_selection is None or page_selection.lower() == \"all\":\n", - " return set(range(total_pages))\n", - " \n", - " selected_pages = set()\n", - " \n", - " # Clean the input\n", - " page_selection = page_selection.strip()\n", - " \n", - " # Split by commas for multiple selections\n", - " parts = page_selection.split(',')\n", - " \n", - " for part in parts:\n", - " part = part.strip()\n", - " \n", - " # Check if it's a range (contains hyphen)\n", - " if '-' in part:\n", - " try:\n", - " # Split range and convert to integers\n", - " range_parts = part.split('-')\n", - " if len(range_parts) == 2:\n", - " start = int(range_parts[0].strip())\n", - " end = int(range_parts[1].strip())\n", - " \n", - " # Convert from 1-indexed to 0-indexed\n", - " start_idx = start - 1\n", - " end_idx = end - 1\n", - " \n", - " # Add all pages in range (inclusive)\n", - " for i in range(start_idx, end_idx + 1):\n", - " if 0 <= i < total_pages:\n", - " selected_pages.add(i)\n", - " except ValueError:\n", - " print(f\"Warning: Invalid range '{part}' in page selection\")\n", - " else:\n", - " # Single page 
number\n", - " try:\n", - " page_num = int(part.strip())\n", - " # Convert from 1-indexed to 0-indexed\n", - " page_idx = page_num - 1\n", - " if 0 <= page_idx < total_pages:\n", - " selected_pages.add(page_idx)\n", - " else:\n", - " print(f\"Warning: Page {page_num} is out of range (1-{total_pages})\")\n", - " except ValueError:\n", - " print(f\"Warning: Invalid page number '{part}' in page selection\")\n", - " \n", - " # If no valid pages were selected, default to all pages\n", - " if not selected_pages:\n", - " print(f\"Warning: No valid pages in selection '{page_selection}'. Showing all pages.\")\n", - " return set(range(total_pages))\n", - " \n", - " return selected_pages\n", - " \n", - " def _get_element_color(self, element_type: str) -> str:\n", - " \"\"\"Get color for element type.\"\"\"\n", - " return self.element_colors.get(element_type.lower(), self.element_colors['default'])\n", - " \n", - " def _get_image_dimensions(self, image_path: str) -> Optional[Tuple[int, int]]:\n", - " \"\"\"Get dimensions of an image file.\"\"\"\n", - " try:\n", - " if os.path.exists(image_path):\n", - " with Image.open(image_path) as img:\n", - " return img.size # Returns (width, height)\n", - " return None\n", - " except Exception as e:\n", - " print(f\"Error getting image dimensions for {image_path}: {e}\")\n", - " return None\n", - " \n", - " def _load_image_as_base64(self, image_path: str) -> Optional[str]:\n", - " \"\"\"Load image from file path and convert to base64.\"\"\"\n", - " try:\n", - " if os.path.exists(image_path):\n", - " with open(image_path, 'rb') as img_file:\n", - " img_data = img_file.read()\n", - " img_base64 = base64.b64encode(img_data).decode('utf-8')\n", - " ext = os.path.splitext(image_path)[1].lower()\n", - " if ext in ['.jpg', '.jpeg']:\n", - " return f\"data:image/jpeg;base64,{img_base64}\"\n", - " elif ext in ['.png']:\n", - " return f\"data:image/png;base64,{img_base64}\"\n", - " else:\n", - " return f\"data:image/jpeg;base64,{img_base64}\"\n", - 
" return None\n", - " except Exception as e:\n", - " print(f\"Error loading image {image_path}: {e}\")\n", - " return None\n", - " \n", - " def _render_element_content(self, element: Dict, for_tooltip: bool = False) -> str:\n", - " \"\"\"Render element content with appropriate formatting for both tooltip and element list display.\n", - " \n", - " Args:\n", - " element: The element dictionary containing content/description\n", - " for_tooltip: Whether this is for tooltip display (affects styling and truncation)\n", - " \"\"\"\n", - " element_type = element.get('type', 'unknown')\n", - " content = element.get('content', '')\n", - " description = element.get('description', '')\n", - " \n", - " display_content = \"\"\n", - " \n", - " if content:\n", - " if element_type == 'table':\n", - " # Render the HTML table with styling\n", - " table_html = content\n", - " \n", - " # Apply different styling based on context\n", - " if for_tooltip:\n", - " # Compact styling for tooltips with light theme\n", - " # Use full width available for tooltip tables\n", - " table_style = f'''style=\"width: 100%; border-collapse: collapse; margin: 5px 0; font-size: 10px;\"'''\n", - " th_style = 'style=\"border: 1px solid #ddd; padding: 4px; background: #f8f9fa; color: #333; font-weight: bold; text-align: left; font-size: 10px;\"'\n", - " td_style = 'style=\"border: 1px solid #ddd; padding: 4px; color: #333; font-size: 10px;\"'\n", - " thead_style = 'style=\"background: #e9ecef;\"'\n", - " else:\n", - " # Full styling for element list\n", - " table_style = '''style=\"width: 100%; border-collapse: collapse; margin: 10px 0; font-size: 13px;\"'''\n", - " th_style = 'style=\"border: 1px solid #ddd; padding: 8px; background: #f5f5f5; font-weight: bold; text-align: left;\"'\n", - " td_style = 'style=\"border: 1px solid #ddd; padding: 8px;\"'\n", - " thead_style = 'style=\"background: #f0f0f0;\"'\n", - " \n", - " # Apply styling transformations\n", - " if '
' in table_html: - table_html = table_html.replace('', f'') - if '' in table_html: - table_html = table_html.replace('', f'') - if '
or tags in first row - import re - - # Find first row (either in thead or tbody) - first_row_match = re.search(r']*>(.*?)
' in table_html:\n", - " table_html = table_html.replace('
', f'
')\n", - " if '' in table_html:\n", - " table_html = table_html.replace('', f'')\n", - " \n", - " if for_tooltip:\n", - " display_content = table_html\n", - " else:\n", - " display_content = f\"
{table_html}
\"\n", - " else:\n", - " # Regular content handling\n", - " if for_tooltip and len(content) > 500:\n", - " # Truncate for tooltip display and escape HTML for safety\n", - " display_content = self._escape_for_html_attribute(content[:500] + \"...\")\n", - " else:\n", - " display_content = self._escape_for_html_attribute(content) if for_tooltip else content\n", - " elif description:\n", - " desc_content = description\n", - " if for_tooltip and len(desc_content) > 500:\n", - " desc_content = desc_content[:500] + \"...\"\n", - " \n", - " if for_tooltip:\n", - " display_content = self._escape_for_html_attribute(f\"Description: {desc_content}\")\n", - " else:\n", - " display_content = f\"Description: {desc_content}\"\n", - " else:\n", - " display_content = \"No content available\" if for_tooltip else \"No content\"\n", - " \n", - " return display_content\n", - " \n", - " def _escape_for_html_attribute(self, text: str) -> str:\n", - " \"\"\"Escape text for safe use in HTML attributes.\"\"\"\n", - " return (text.replace('&', '&')\n", - " .replace('<', '<')\n", - " .replace('>', '>')\n", - " .replace('\"', '"')\n", - " .replace(\"'\", ''')\n", - " .replace('\\n', '
'))\n", - " \n", - " def _calculate_tooltip_width(self, element: Dict, image_width: int) -> int:\n", - " \"\"\"Calculate dynamic tooltip width based on table content.\"\"\"\n", - " element_type = element.get('type', 'unknown')\n", - " content = element.get('content', '')\n", - " \n", - " if element_type == 'table' and content:\n", - " # Count columns by looking for ', content, re.DOTALL | re.IGNORECASE)\n", - " if first_row_match:\n", - " first_row = first_row_match.group(1)\n", - " # Count th or td tags\n", - " th_count = len(re.findall(r']*>', first_row, re.IGNORECASE))\n", - " td_count = len(re.findall(r']*>', first_row, re.IGNORECASE))\n", - " column_count = max(th_count, td_count)\n", - " \n", - " if column_count > 0:\n", - " # Base width + additional width per column\n", - " base_width = 300\n", - " width_per_column = 80\n", - " calculated_width = base_width + (column_count * width_per_column)\n", - " \n", - " # Cap at 4/5th of image width\n", - " max_width = int(image_width * 0.8)\n", - " return min(calculated_width, max_width)\n", - " \n", - " # Default width for non-tables or when calculation fails\n", - " return 400\n", - " \n", - " def _create_annotated_image(self, page: Dict, elements: List[Dict]) -> str:\n", - " \"\"\"Create annotated image with SCALING to fit within 1024px width.\"\"\"\n", - " image_uri = page.get('image_uri', '')\n", - " page_id = page.get('id', 0)\n", - " \n", - " if not image_uri:\n", - " return \"

No image URI found for this page

\"\n", - " \n", - " # Load image\n", - " img_data_uri = self._load_image_as_base64(image_uri)\n", - " if not img_data_uri:\n", - " return f\"\"\"\n", - "
\n", - " Could not load image: {image_uri}
\n", - " Make sure the file exists and is accessible.\n", - "
\n", - " \"\"\"\n", - " \n", - " # Get original image dimensions\n", - " original_dimensions = self._get_image_dimensions(image_uri)\n", - " if not original_dimensions:\n", - " # Fallback: display without explicit scaling\n", - " original_width, original_height = 1024, 768 # Default fallback\n", - " else:\n", - " original_width, original_height = original_dimensions\n", - " \n", - " # Calculate scaling factor to fit within 1024px width\n", - " max_display_width = 1024\n", - " scale_factor = 1.0\n", - " display_width = original_width\n", - " display_height = original_height\n", - " \n", - " if original_width > max_display_width:\n", - " scale_factor = max_display_width / original_width\n", - " display_width = max_display_width\n", - " display_height = int(original_height * scale_factor)\n", - " \n", - " # Filter elements for this page and collect their bounding boxes\n", - " page_elements = []\n", - " \n", - " for elem in elements:\n", - " elem_bboxes = []\n", - " for bbox in elem.get('bbox', []):\n", - " if bbox.get('page_id', 0) == page_id:\n", - " coord = bbox.get('coord', [])\n", - " if len(coord) >= 4:\n", - " elem_bboxes.append(bbox)\n", - " \n", - " if elem_bboxes:\n", - " page_elements.append({\n", - " 'element': elem,\n", - " 'bboxes': elem_bboxes\n", - " })\n", - " \n", - " if not page_elements:\n", - " return f\"

No elements found for page {page_id}

\"\n", - " \n", - " header_info = f\"\"\"\n", - "
\n", - " Page {page_id + 1}: {len(page_elements)} elements
\n", - " Original size: {original_width}×{original_height}px | \n", - " Display size: {display_width}×{display_height}px | \n", - " Scale factor: {scale_factor:.3f}
\n", - "
\n", - " \"\"\"\n", - " \n", - " # Generate unique container ID for this page\n", - " container_id = f\"page_container_{page_id}_{id(self)}\"\n", - " \n", - " # Create bounding box overlays using SCALED coordinates with hover functionality\n", - " overlays = []\n", - " \n", - " for idx, item in enumerate(page_elements):\n", - " element = item['element']\n", - " element_id = element.get('id', 'N/A')\n", - " element_type = element.get('type', 'unknown')\n", - " color = self._get_element_color(element_type)\n", - " \n", - " # Use the shared content renderer for tooltip\n", - " tooltip_content = self._render_element_content(element, for_tooltip=True)\n", - " \n", - " # Calculate dynamic tooltip width\n", - " tooltip_width = self._calculate_tooltip_width(element, display_width)\n", - " \n", - " # Tables should render as HTML, other content should be escaped\n", - " \n", - " for bbox_idx, bbox in enumerate(item['bboxes']):\n", - " coord = bbox.get('coord', [])\n", - " if len(coord) >= 4:\n", - " x1, y1, x2, y2 = coord\n", - " \n", - " # Apply scaling to coordinates\n", - " scaled_x1 = x1 * scale_factor\n", - " scaled_y1 = y1 * scale_factor\n", - " scaled_x2 = x2 * scale_factor\n", - " scaled_y2 = y2 * scale_factor\n", - " \n", - " width = scaled_x2 - scaled_x1\n", - " height = scaled_y2 - scaled_y1\n", - " \n", - " # Skip invalid boxes\n", - " if width <= 0 or height <= 0:\n", - " continue\n", - " \n", - " # Position label above box when possible\n", - " label_top = -18 if scaled_y1 >= 18 else 2\n", - " \n", - " # Unique ID for this bounding box\n", - " box_id = f\"bbox_{page_id}_{idx}_{bbox_idx}\"\n", - " \n", - " # Calculate tooltip position (prefer right side, but switch to left if needed)\n", - " tooltip_left = 10\n", - " \n", - " overlay = f\"\"\"\n", - "
\n", - "
\n", - " {element_type.upper()[:6]}#{element_id}\n", - "
\n", - " \n", - "
\n", - "
\n", - " {element_type.upper()} #{element_id}\n", - "
\n", - "
\n", - " {tooltip_content}\n", - "
\n", - "
\n", - "
\n", - " \"\"\"\n", - " overlays.append(overlay)\n", - " \n", - " # Pure CSS hover functionality (works in Databricks)\n", - " styles = f\"\"\"\n", - " \n", - " \"\"\"\n", - " \n", - " return f\"\"\"\n", - " {header_info}\n", - " {styles}\n", - "
\n", - " \"Page\n", - " {''.join(overlays)}\n", - "
\n", - " \"\"\"\n", - " \n", - " def _create_page_elements_list(self, page_id: int, elements: List[Dict]) -> str:\n", - " \"\"\"Create a detailed list of elements for a specific page.\"\"\"\n", - " # Filter elements for this page\n", - " page_elements = []\n", - " \n", - " for elem in elements:\n", - " elem_bboxes = []\n", - " for bbox in elem.get('bbox', []):\n", - " if bbox.get('page_id', 0) == page_id:\n", - " elem_bboxes.append(bbox)\n", - " \n", - " if elem_bboxes:\n", - " page_elements.append(elem)\n", - " \n", - " if not page_elements:\n", - " return f\"

No elements found for page {page_id + 1}

\"\n", - " \n", - " html_parts = []\n", - " \n", - " for element in page_elements:\n", - " element_id = element.get('id', 'N/A')\n", - " element_type = element.get('type', 'unknown')\n", - " color = self._get_element_color(element_type)\n", - " \n", - " # Get bounding box info for this page only\n", - " bbox_info = \"No bbox\"\n", - " bbox_list = element.get('bbox', [])\n", - " if bbox_list:\n", - " bbox_details = []\n", - " for bbox in bbox_list:\n", - " if bbox.get('page_id', 0) == page_id:\n", - " coord = bbox.get('coord', [])\n", - " if len(coord) >= 4:\n", - " bbox_details.append(f\"[{coord[0]:.0f}, {coord[1]:.0f}, {coord[2]:.0f}, {coord[3]:.0f}]\")\n", - " bbox_info = \"; \".join(bbox_details) if bbox_details else \"Invalid bbox\"\n", - " \n", - " # Use the shared content renderer for element list display\n", - " display_content = self._render_element_content(element, for_tooltip=False)\n", - " \n", - " element_html = f\"\"\"\n", - "
\n", - "
\n", - "

\n", - " {element_type.upper().replace('_', ' ')} (ID: {element_id})\n", - "

\n", - " \n", - " {bbox_info}\n", - " \n", - "
\n", - "
\n", - " {display_content}\n", - "
\n", - "
\n", - " \"\"\"\n", - " html_parts.append(element_html)\n", - " \n", - " return f\"\"\"\n", - "
\n", - "

\uD83D\uDCCB Page {page_id + 1} Elements ({len(page_elements)} items)

\n", - " {''.join(html_parts)}\n", - "
\n", - " \"\"\"\n", - " \n", - " def _create_summary(self, document: Dict, metadata: Dict, selected_pages: Set[int], total_pages: int) -> str:\n", - " \"\"\"Create a summary with page selection info.\"\"\"\n", - " elements = document.get('elements', [])\n", - " \n", - " # Count elements only on selected pages\n", - " selected_elements = []\n", - " for elem in elements:\n", - " for bbox in elem.get('bbox', []):\n", - " if bbox.get('page_id', 0) in selected_pages:\n", - " selected_elements.append(elem)\n", - " break\n", - " \n", - " # Count by type (for selected pages)\n", - " type_counts = {}\n", - " for elem in selected_elements:\n", - " elem_type = elem.get('type', 'unknown')\n", - " type_counts[elem_type] = type_counts.get(elem_type, 0) + 1\n", - " \n", - " type_list = ', '.join([f\"{t}: {c}\" for t, c in type_counts.items()])\n", - " \n", - " # Create page selection info\n", - " if len(selected_pages) == total_pages:\n", - " page_info = f\"All {total_pages} pages\"\n", - " else:\n", - " # Convert to 1-indexed for display\n", - " page_nums = sorted([p + 1 for p in selected_pages])\n", - " if len(page_nums) <= 10:\n", - " page_info = f\"Pages {', '.join(map(str, page_nums))} ({len(selected_pages)} of {total_pages})\"\n", - " else:\n", - " page_info = f\"{len(selected_pages)} of {total_pages} pages selected\"\n", - " \n", - " return f\"\"\"\n", - "
\n", - "

\uD83D\uDCC4 Document Summary

\n", - "

Displaying: {page_info}

\n", - "

Elements on selected pages: {len(selected_elements)}

\n", - "

Element Types: {type_list if type_list else 'None'}

\n", - "

Document ID: {str(metadata.get('id', 'N/A'))[:12]}...

\n", - "
\n", - " \"\"\"\n", - " \n", - " def render_document(self, parsed_result: Any, page_selection: Union[str, None] = None) -> None:\n", - " \"\"\"Main render function with page selection support.\n", - " \n", - " Args:\n", - " parsed_result: The parsed document result\n", - " page_selection: Page selection string. Supported formats:\n", - " - \"all\" or None: Display all pages\n", - " - \"3\": Display only page 3 (1-indexed)\n", - " - \"1-5\": Display pages 1 through 5 (inclusive)\n", - " - \"1,3,5\": Display specific pages\n", - " - \"1-3,7,10-12\": Mixed format\n", - " \"\"\"\n", - " try:\n", - " # Convert to dict\n", - " if hasattr(parsed_result, 'toPython'):\n", - " parsed_dict = parsed_result.toPython()\n", - " elif hasattr(parsed_result, 'toJson'):\n", - " parsed_dict = json.loads(parsed_result.toJson())\n", - " elif isinstance(parsed_result, dict):\n", - " parsed_dict = parsed_result\n", - " else:\n", - " display(HTML(f\"

❌ Could not convert result. Type: {type(parsed_result)}

\"))\n", - " return\n", - " \n", - " # Extract components\n", - " document = parsed_dict.get('document', {})\n", - " pages = document.get('pages', [])\n", - " elements = document.get('elements', [])\n", - " metadata = parsed_dict.get('metadata', {})\n", - " \n", - " if not elements:\n", - " display(HTML(\"

❌ No elements found in document

\"))\n", - " return\n", - " \n", - " # Parse page selection\n", - " selected_pages = self._parse_page_selection(page_selection, len(pages))\n", - " \n", - " # Display title\n", - " display(HTML(\"

\uD83D\uDD0D AI Parse Document Results

\"))\n", - " \n", - " # Display summary with page selection info\n", - " summary_html = self._create_summary(document, metadata, selected_pages, len(pages))\n", - " display(HTML(summary_html))\n", - " \n", - " # Display color legend\n", - " legend_items = []\n", - " for elem_type, color in self.element_colors.items():\n", - " if elem_type != 'default':\n", - " legend_items.append(f\"\"\"\n", - " \n", - " \n", - " {elem_type.replace('_', ' ').title()}\n", - " \n", - " \"\"\")\n", - " \n", - " display(HTML(f\"\"\"\n", - "
\n", - " \uD83C\uDFA8 Element Colors:
\n", - " {''.join(legend_items)}\n", - "
\n", - " \"\"\"))\n", - " \n", - " # Display annotated images with their corresponding elements (filtered by selection)\n", - " if pages:\n", - " display(HTML(\"

\uD83D\uDDBC️ Annotated Images & Elements

\"))\n", - " \n", - " # Sort selected pages for display\n", - " sorted_selected = sorted(selected_pages)\n", - " \n", - " for page_idx in sorted_selected:\n", - " if page_idx < len(pages):\n", - " page = pages[page_idx]\n", - " \n", - " # Display the annotated image\n", - " annotated_html = self._create_annotated_image(page, elements)\n", - " display(HTML(f\"
{annotated_html}
\"))\n", - " \n", - " # Display elements for this page immediately after the image\n", - " page_id = page.get('id', page_idx)\n", - " page_elements_html = self._create_page_elements_list(page_id, elements)\n", - " display(HTML(page_elements_html))\n", - " \n", - " except Exception as e:\n", - " display(HTML(f\"

❌ Error: {str(e)}

\"))\n", - " import traceback\n", - " display(HTML(f\"
{traceback.format_exc()}
\"))\n", - "\n", - "\n", - "# Simple usage functions\n", - "def render_ai_parse_output(parsed_result, page_selection=None):\n", - " \"\"\"Simple function to render ai_parse_document output with page selection.\n", - " \n", - " Args:\n", - " parsed_result: The parsed document result\n", - " page_selection: Optional page selection string. Examples:\n", - " - None or \"all\": Display all pages\n", - " - \"3\": Display only page 3\n", - " - \"1-5\": Display pages 1 through 5\n", - " - \"1,3,5\": Display specific pages\n", - " - \"1-3,7,10-12\": Mixed format\n", - " \"\"\"\n", - " renderer = DocumentRenderer()\n", - " renderer.render_document(parsed_result, page_selection)" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "7498a4a3-7443-4503-bd13-10014953e73b", - "showTitle": true, - "tableResultSettingsMap": {}, - "title": "Debug Visualization Results" - } - }, - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/html": [ - "

\uD83D\uDD0D AI Parse Document Results

" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "output_type": "display_data", - "data": { - "text/html": [ - "\n", - "
\n", - "

\uD83D\uDCC4 Document Summary

\n", - "

Displaying: Pages 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 (10 of 78)

\n", - "

Elements on selected pages: 84

\n", - "

Element Types: text: 33, title: 1, figure: 18, page_header: 18, section_header: 12, table: 1, caption: 1

\n", - "

Document ID: 3a07bf7a-e00...

\n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "output_type": "display_data", - "data": { - "text/html": [ - "\n", - "
\n", - " \uD83C\uDFA8 Element Colors:
\n", - " \n", - " \n", - " \n", - " Section Header\n", - " \n", - " \n", - " \n", - " \n", - " Text\n", - " \n", - " \n", - " \n", - " \n", - " Figure\n", - " \n", - " \n", - " \n", - " \n", - " Caption\n", - " \n", - " \n", - " \n", - " \n", - " Page Footer\n", - " \n", - " \n", - " \n", - " \n", - " Page Header\n", - " \n", - " \n", - " \n", - " \n", - " Table\n", - " \n", - " \n", - " \n", - " \n", - " List\n", - " \n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "output_type": "display_data", - "data": { - "text/html": [ - "

\uD83D\uDDBC️ Annotated Images & Elements

" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "output_type": "display_data", - "data": { - "text/html": [ - "
\n", - " \n", - "
\n", - " Page 1: 11 elements
\n", - " Original size: 1706×1280px | \n", - " Display size: 1024×768px | \n", - " Scale factor: 0.600
\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \"Page\n", - " \n", - "
\n", - "
\n", - " TEXT#0\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT #0\n", - "
\n", - "
\n", - " eBook\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " TITLE#1\n", - "
\n", - " \n", - "
\n", - "
\n", - " TITLE #1\n", - "
\n", - "
\n", - " The Big Book of MLOps\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT#2\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT #2\n", - "
\n", - "
\n", - " NOW INCLUDING A SECTION ON LLMOPS\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " FIGURE#3\n", - "
\n", - " \n", - "
\n", - "
\n", - " FIGURE #3\n", - "
\n", - "
\n", - " Description: The logo features the word "databricks" alongside a stylized, geometric design composed of red and white triangles.\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT#4\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT #4\n", - "
\n", - "
\n", - " 2ND EDITION\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " FIGURE#5\n", - "
\n", - " \n", - "
\n", - "
\n", - " FIGURE #5\n", - "
\n", - "
\n", - " Description: Two teal and green hexagonal icons display stylized figures and abstract circular designs connected by lines.\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " FIGURE#6\n", - "
\n", - " \n", - "
\n", - "
\n", - " FIGURE #6\n", - "
\n", - "
\n", - " Description: An orange hexagon contains a circular design featuring a gear, wrench, and a person icon.\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT#7\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT #7\n", - "
\n", - "
\n", - " MODELOPS\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT#8\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT #8\n", - "
\n", - "
\n", - " DATAOPS\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT#9\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT #9\n", - "
\n", - "
\n", - " DEVOPS\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT#10\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT #10\n", - "
\n", - "
\n", - " JOSEPH BRADLEY RAFI KURLANSIK MATT THOMSON NIALL TURBITT\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "output_type": "display_data", - "data": { - "text/html": [ - "\n", - "
\n", - "

\uD83D\uDCCB Page 1 Elements (11 items)

\n", - " \n", - "
\n", - "
\n", - "

\n", - " TEXT (ID: 0)\n", - "

\n", - " \n", - " [118, 323, 208, 353]\n", - " \n", - "
\n", - "
\n", - " eBook\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " TITLE (ID: 1)\n", - "

\n", - " \n", - " [118, 374, 659, 587]\n", - " \n", - "
\n", - "
\n", - " The Big Book of MLOps\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " TEXT (ID: 2)\n", - "

\n", - " \n", - " [118, 647, 457, 724]\n", - " \n", - "
\n", - "
\n", - " NOW INCLUDING A SECTION ON LLMOPS\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " FIGURE (ID: 3)\n", - "

\n", - " \n", - " [120, 1186, 358, 1226]\n", - " \n", - "
\n", - "
\n", - " Description: The logo features the word \"databricks\" alongside a stylized, geometric design composed of red and white triangles.\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " TEXT (ID: 4)\n", - "

\n", - " \n", - " [1531, 39, 1674, 181]\n", - " \n", - "
\n", - "
\n", - " 2ND EDITION\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " FIGURE (ID: 5)\n", - "

\n", - " \n", - " [655, 570, 1131, 793]\n", - " \n", - "
\n", - "
\n", - " Description: Two teal and green hexagonal icons display stylized figures and abstract circular designs connected by lines.\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " FIGURE (ID: 6)\n", - "

\n", - " \n", - " [1211, 570, 1409, 793]\n", - " \n", - "
\n", - "
\n", - " Description: An orange hexagon contains a circular design featuring a gear, wrench, and a person icon.\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " TEXT (ID: 7)\n", - "

\n", - " \n", - " [684, 874, 813, 896]\n", - " \n", - "
\n", - "
\n", - " MODELOPS\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " TEXT (ID: 8)\n", - "

\n", - " \n", - " [977, 874, 1083, 896]\n", - " \n", - "
\n", - "
\n", - " DATAOPS\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " TEXT (ID: 9)\n", - "

\n", - " \n", - " [1259, 874, 1356, 896]\n", - " \n", - "
\n", - "
\n", - " DEVOPS\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " TEXT (ID: 10)\n", - "

\n", - " \n", - " [853, 1201, 1565, 1220]\n", - " \n", - "
\n", - "
\n", - " JOSEPH BRADLEY RAFI KURLANSIK MATT THOMSON NIALL TURBITT\n", - "
\n", - "
\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "output_type": "display_data", - "data": { - "text/html": [ - "
\n", - " \n", - "
\n", - " Page 2: 5 elements
\n", - " Original size: 1706×1280px | \n", - " Display size: 1024×768px | \n", - " Scale factor: 0.600
\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \"Page\n", - " \n", - "
\n", - "
\n", - " PAGE_H#11\n", - "
\n", - " \n", - "
\n", - "
\n", - " PAGE_HEADER #11\n", - "
\n", - "
\n", - " BIG BOOK OF MLOPS - 2ND EDITION\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " PAGE_H#12\n", - "
\n", - " \n", - "
\n", - "
\n", - " PAGE_HEADER #12\n", - "
\n", - "
\n", - " 2\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " SECTIO#13\n", - "
\n", - " \n", - "
\n", - "
\n", - " SECTION_HEADER #13\n", - "
\n", - "
\n", - " Contents\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT#14\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT #14\n", - "
\n", - "
\n", - " CHAPTER 1 Introduction ......<DOT_LEADER> 5
CHAPTER 2 Big Book of MLOps V1 Recap ......<DOT_LEADER> 6
Why should I care about MLOps? ......<DOT_LEADER> 6
Guiding principles ......<DOT_LEADER> 6
Semantics of development, staging and production ......<DOT_LEADER> 7
ML deployment patterns ......<DOT_LEADER> 8
CHAPTER 3 What's New? ......<DOT_LEADER> 10
Unity Catalog ......<DOT_LEADER> 10
Benefits and architecture implications ......<DOT_LEADER> 11
Model Serving ......<DOT_LEADER> 13
Benefits and ar...\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " FIGURE#15\n", - "
\n", - " \n", - "
\n", - "
\n", - " FIGURE #15\n", - "
\n", - "
\n", - " Description: The image displays the Databricks logo, featuring a stylized red and gray geometric design alongside the company name in bold text.\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "output_type": "display_data", - "data": { - "text/html": [ - "\n", - "
\n", - "

\uD83D\uDCCB Page 2 Elements (5 items)

\n", - " \n", - "
\n", - "
\n", - "

\n", - " PAGE HEADER (ID: 11)\n", - "

\n", - " \n", - " [59, 61, 347, 77]\n", - " \n", - "
\n", - "
\n", - " BIG BOOK OF MLOPS - 2ND EDITION\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " PAGE HEADER (ID: 12)\n", - "

\n", - " \n", - " [1634, 61, 1647, 77]\n", - " \n", - "
\n", - "
\n", - " 2\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " SECTION HEADER (ID: 13)\n", - "

\n", - " \n", - " [59, 231, 297, 287]\n", - " \n", - "
\n", - "
\n", - " Contents\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " TEXT (ID: 14)\n", - "

\n", - " \n", - " [487, 257, 1601, 1056]\n", - " \n", - "
\n", - "
\n", - " CHAPTER 1 Introduction ...... 5\n", - "CHAPTER 2 Big Book of MLOps V1 Recap ...... 6\n", - "Why should I care about MLOps? ...... 6\n", - "Guiding principles ...... 6\n", - "Semantics of development, staging and production ...... 7\n", - "ML deployment patterns ...... 8\n", - "CHAPTER 3 What's New? ...... 10\n", - "Unity Catalog ...... 10\n", - "Benefits and architecture implications ...... 11\n", - "Model Serving ...... 13\n", - "Benefits and architecture implications ...... 13\n", - "Lakehouse Monitoring ...... 15\n", - "Benefits and architecture implications ...... 15\n", - "CHAPTER 4 Design Decisions ...... 17\n", - "Unity Catalog ...... 17\n", - "Organizing data and Al assets ...... 17\n", - "Concepts ...... 18\n", - "Considerations ...... 21\n", - "Recommended organization ...... 23\n", - "Model Serving...... 27\n", - "Pre-deployment testing ...... 28\n", - "Real-time model deployment ...... 29\n", - "Implementing in Databricks ...... 30\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " FIGURE (ID: 15)\n", - "

\n", - " \n", - " [59, 1181, 322, 1226]\n", - " \n", - "
\n", - "
\n", - " Description: The image displays the Databricks logo, featuring a stylized red and gray geometric design alongside the company name in bold text.\n", - "
\n", - "
\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "output_type": "display_data", - "data": { - "text/html": [ - "
\n", - " \n", - "
\n", - " Page 3: 5 elements
\n", - " Original size: 1706×1280px | \n", - " Display size: 1024×768px | \n", - " Scale factor: 0.600
\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \"Page\n", - " \n", - "
\n", - "
\n", - " PAGE_H#16\n", - "
\n", - " \n", - "
\n", - "
\n", - " PAGE_HEADER #16\n", - "
\n", - "
\n", - " BIG BOOK OF MLOPS - 2ND EDITION\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " PAGE_H#17\n", - "
\n", - " \n", - "
\n", - "
\n", - " PAGE_HEADER #17\n", - "
\n", - "
\n", - " 3\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " SECTIO#18\n", - "
\n", - " \n", - "
\n", - "
\n", - " SECTION_HEADER #18\n", - "
\n", - "
\n", - " Contents\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " TABLE#19\n", - "
\n", - " \n", - "
\n", - "
\n", - " TABLE #19\n", - "
\n", - "
\n", - "
' in table_html:\n", - " table_html = table_html.replace('', f'')\n", - " if '' in table_html:\n", - " table_html = table_html.replace('', f'')\n", - " if '
or tags in first row\n", - " import re\n", - " \n", - " # Find first row (either in thead or tbody)\n", - " first_row_match = re.search(r']*>(.*?)
CHAPTER 5Reference Architecture31
Multi-environment view
Development34
Data35
Exploratory data analysis (EDA)35
Project code36
Model training development36
Model validation and deployment development37
Commit code38
Staging39
Data40
Merge code40
Integration tests (CI)40
Merge41
Cut release branch41
Production42
Model training44
Model validation45
Model deployment46
Model Serving48
Inference: batch or streaming48
Lakehouse Monitoring49
Retraining49
\n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
\n", - " FIGURE#20\n", - "
\n", - " \n", - "
\n", - "
\n", - " FIGURE #20\n", - "
\n", - "
\n", - " Description: The logo features the word "databricks" in a stylized font accompanied by a red, geometric icon to the left.\n", - "
\n", - "
\n", - "
\n", - " \n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "output_type": "display_data", - "data": { - "text/html": [ - "\n", - "
\n", - "

\uD83D\uDCCB Page 3 Elements (5 items)

\n", - " \n", - "
\n", - "
\n", - "

\n", - " PAGE HEADER (ID: 16)\n", - "

\n", - " \n", - " [59, 61, 347, 77]\n", - " \n", - "
\n", - "
\n", - " BIG BOOK OF MLOPS - 2ND EDITION\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " PAGE HEADER (ID: 17)\n", - "

\n", - " \n", - " [1634, 61, 1647, 77]\n", - " \n", - "
\n", - "
\n", - " 3\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " SECTION HEADER (ID: 18)\n", - "

\n", - " \n", - " [59, 231, 301, 287]\n", - " \n", - "
\n", - "
\n", - " Contents\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " TABLE (ID: 19)\n", - "

\n", - " \n", - " [484, 256, 1603, 967]\n", - " \n", - "
\n", - "
\n", - "
CHAPTER 5Reference Architecture31
Multi-environment view
Development34
Data35
Exploratory data analysis (EDA)35
Project code36
Model training development36
Model validation and deployment development37
Commit code38
Staging39
Data40
Merge code40
Integration tests (CI)40
Merge41
Cut release branch41
Production42
Model training44
Model validation45
Model deployment46
Model Serving48
Inference: batch or streaming48
Lakehouse Monitoring49
Retraining49
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " FIGURE (ID: 20)\n", - "

\n", - " \n", - " [51, 1183, 326, 1227]\n", - " \n", - "
\n", - "
\n", - " Description: The logo features the word \"databricks\" in a stylized font accompanied by a red, geometric icon to the left.\n", - "
\n", - "
\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "output_type": "display_data", - "data": { - "text/html": [ - "
\n", - " \n", - "
\n", - " Page 4: 5 elements
\n", - " Original size: 1706×1280px | \n", - " Display size: 1024×768px | \n", - " Scale factor: 0.600
\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \"Page\n", - " \n", - "
\n", - "
\n", - " PAGE_H#21\n", - "
\n", - " \n", - "
\n", - "
\n", - " PAGE_HEADER #21\n", - "
\n", - "
\n", - " BIG BOOK OF MLOP'S - 2ND EDITION\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " PAGE_H#22\n", - "
\n", - " \n", - "
\n", - "
\n", - " PAGE_HEADER #22\n", - "
\n", - "
\n", - " 4\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " SECTIO#23\n", - "
\n", - " \n", - "
\n", - "
\n", - " SECTION_HEADER #23\n", - "
\n", - "
\n", - " Contents\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT#24\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT #24\n", - "
\n", - "
\n", - " CHAPTER 6 LLMOps 51
What changes with LLMs? 51
Key components of LLM-powered applications 54
Prompt engineering 54
Leveraging your own data 56
Retrieval augmented generation (RAG) 58
Typical RAG workflow 59
Vector database 60
Benefits of vector databases in a RAG workflow 61
Fine-tuning LLMs 62
When to use fine-tuning? 63
Fine-tuning in practice 63
Pre-training 64
When to use pre-training? 64
Pre-training in practice 65
Third-party APIs vs. self-hosted models 66
Model evaluation 67
LLMs as evalu...\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " FIGURE#25\n", - "
\n", - " \n", - "
\n", - "
\n", - " FIGURE #25\n", - "
\n", - "
\n", - " Description: The image features the Databricks logo, composed of a red diamond and red chevron shapes alongside the company name in black text.\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "output_type": "display_data", - "data": { - "text/html": [ - "\n", - "
\n", - "

\uD83D\uDCCB Page 4 Elements (5 items)

\n", - " \n", - "
\n", - "
\n", - "

\n", - " PAGE HEADER (ID: 21)\n", - "

\n", - " \n", - " [59, 64, 345, 80]\n", - " \n", - "
\n", - "
\n", - " BIG BOOK OF MLOP'S - 2ND EDITION\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " PAGE HEADER (ID: 22)\n", - "

\n", - " \n", - " [1634, 64, 1651, 76]\n", - " \n", - "
\n", - "
\n", - " 4\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " SECTION HEADER (ID: 23)\n", - "

\n", - " \n", - " [59, 240, 297, 287]\n", - " \n", - "
\n", - "
\n", - " Contents\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " TEXT (ID: 24)\n", - "

\n", - " \n", - " [484, 259, 1603, 1153]\n", - " \n", - "
\n", - "
\n", - " CHAPTER 6 LLMOps 51\n", - "What changes with LLMs? 51\n", - "Key components of LLM-powered applications 54\n", - "Prompt engineering 54\n", - "Leveraging your own data 56\n", - "Retrieval augmented generation (RAG) 58\n", - "Typical RAG workflow 59\n", - "Vector database 60\n", - "Benefits of vector databases in a RAG workflow 61\n", - "Fine-tuning LLMs 62\n", - "When to use fine-tuning? 63\n", - "Fine-tuning in practice 63\n", - "Pre-training 64\n", - "When to use pre-training? 64\n", - "Pre-training in practice 65\n", - "Third-party APIs vs. self-hosted models 66\n", - "Model evaluation 67\n", - "LLMs as evaluators 69\n", - "Human feedback in evaluation 69\n", - "Packaging models or pipelines for deployment 70\n", - "LLM Inference 71\n", - "Real-time inference 71\n", - "Batch inference 71\n", - "Inference with large models 72\n", - "Managing cost/performance trade-offs 72\n", - "Methods for reducing costs of inference 73\n", - "Reference architecture 74\n", - "RAG with a third-party LLM API 74\n", - "RAG with a fine-tuned OSS model 75\n", - "\n", - "CHAPTER 7 Conclusion 78\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " FIGURE (ID: 25)\n", - "

\n", - " \n", - " [59, 1181, 322, 1226]\n", - " \n", - "
\n", - "
\n", - " Description: The image features the Databricks logo, composed of a red diamond and red chevron shapes alongside the company name in black text.\n", - "
\n", - "
\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "output_type": "display_data", - "data": { - "text/html": [ - "
\n", - " \n", - "
\n", - " Page 5: 9 elements
\n", - " Original size: 1706×1280px | \n", - " Display size: 1024×768px | \n", - " Scale factor: 0.600
\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \"Page\n", - " \n", - "
\n", - "
\n", - " PAGE_H#26\n", - "
\n", - " \n", - "
\n", - "
\n", - " PAGE_HEADER #26\n", - "
\n", - "
\n", - " BIG BOOK OF MLOPS - 2ND EDITION\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " PAGE_H#27\n", - "
\n", - " \n", - "
\n", - "
\n", - " PAGE_HEADER #27\n", - "
\n", - "
\n", - " 5\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " SECTIO#28\n", - "
\n", - " \n", - "
\n", - "
\n", - " SECTION_HEADER #28\n", - "
\n", - "
\n", - " CHAPTER 1
Introduction\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT#29\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT #29\n", - "
\n", - "
\n", - " Machine learning operations (MLOps) is a rapidly evolving field where building and maintaining robust, flexible and efficient workflows is critical. At Databricks, we view MLOps as the set of processes and automation for managing data, code and models to improve performance stability and long-term efficiency in ML systems.\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT#30\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT #30\n", - "
\n", - "
\n", - " MLOps = DataOps + DevOps + ModelOps\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT#31\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT #31\n", - "
\n", - "
\n", - " Through this lens, we strive to continuously innovate and advance our product offerings to simplify the ability to build AI-powered solutions on the Lakehouse. We believe there is no greater accelerant to delivering ML to production than building on a unified, data-centric AI platform. On Databricks, both data and models can be managed and governed in a single governance solution in the form of Unity Catalog. The previously complex infrastructure required to serve real-time models can now be rep...\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT#32\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT #32\n", - "
\n", - "
\n", - " Perhaps the most significant recent change in the machine learning landscape has been the rapid advancement of generative AI. Generative models such as large language models (LLMs) and image generation models have revolutionized the field, unlocking previously unattainable levels of natural language and image generation. However, their arrival also introduces a new set of challenges and decisions to be made in the context of MLOps.\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT#33\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT #33\n", - "
\n", - "
\n", - " With all these developments in mind, we're excited to present this updated version of the Big Book of MLOps. This guide incorporates new Databricks features such as Models in Unity Catalog, Model Serving, and Lakehouse Monitoring into our MLOps architecture recommendations. We start by outlining the themes that still remain relevant from the previous version of the Big Book of MLOps. Following this, we unpack the new features introduced in this version, their impact on the previous reference arc...\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " FIGURE#34\n", - "
\n", - " \n", - "
\n", - "
\n", - " FIGURE #34\n", - "
\n", - "
\n", - " Description: The logo features the word "databricks" in bold, dark blue text accompanied by a stylized red and white graphic to the left.\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "output_type": "display_data", - "data": { - "text/html": [ - "\n", - "
\n", - "

\uD83D\uDCCB Page 5 Elements (9 items)

\n", - " \n", - "
\n", - "
\n", - "

\n", - " PAGE HEADER (ID: 26)\n", - "

\n", - " \n", - " [59, 61, 345, 77]\n", - " \n", - "
\n", - "
\n", - " BIG BOOK OF MLOPS - 2ND EDITION\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " PAGE HEADER (ID: 27)\n", - "

\n", - " \n", - " [1634, 61, 1647, 77]\n", - " \n", - "
\n", - "
\n", - " 5\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " SECTION HEADER (ID: 28)\n", - "

\n", - " \n", - " [59, 210, 301, 287]\n", - " \n", - "
\n", - "
\n", - " CHAPTER 1\n", - "Introduction\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " TEXT (ID: 29)\n", - "

\n", - " \n", - " [607, 259, 1622, 374]\n", - " \n", - "
\n", - "
\n", - " Machine learning operations (MLOps) is a rapidly evolving field where building and maintaining robust, flexible and efficient workflows is critical. At Databricks, we view MLOps as the set of processes and automation for managing data, code and models to improve performance stability and long-term efficiency in ML systems.\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " TEXT (ID: 30)\n", - "

\n", - " \n", - " [853, 433, 1232, 451]\n", - " \n", - "
\n", - "
\n", - " MLOps = DataOps + DevOps + ModelOps\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " TEXT (ID: 31)\n", - "

\n", - " \n", - " [607, 509, 1649, 750]\n", - " \n", - "
\n", - "
\n", - " Through this lens, we strive to continuously innovate and advance our product offerings to simplify the ability to build AI-powered solutions on the Lakehouse. We believe there is no greater accelerant to delivering ML to production than building on a unified, data-centric AI platform. On Databricks, both data and models can be managed and governed in a single governance solution in the form of Unity Catalog. The previously complex infrastructure required to serve real-time models can now be replaced and easily scaled with Databricks Model Serving. Long-term efficiency and performance stability of ML in production can be achieved using Databricks Lakehouse Monitoring. These components collectively form the data pipelines of an ML solution, all of which can be orchestrated using Databricks Workflows.\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " TEXT (ID: 32)\n", - "

\n", - " \n", - " [607, 777, 1647, 893]\n", - " \n", - "
\n", - "
\n", - " Perhaps the most significant recent change in the machine learning landscape has been the rapid advancement of generative AI. Generative models such as large language models (LLMs) and image generation models have revolutionized the field, unlocking previously unattainable levels of natural language and image generation. However, their arrival also introduces a new set of challenges and decisions to be made in the context of MLOps.\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " TEXT (ID: 33)\n", - "

\n", - " \n", - " [607, 919, 1651, 1161]\n", - " \n", - "
\n", - "
\n", - " With all these developments in mind, we're excited to present this updated version of the Big Book of MLOps. This guide incorporates new Databricks features such as Models in Unity Catalog, Model Serving, and Lakehouse Monitoring into our MLOps architecture recommendations. We start by outlining the themes that still remain relevant from the previous version of the Big Book of MLOps. Following this, we unpack the new features introduced in this version, their impact on the previous reference architecture, and best practices when incorporating these into your MLOps workflows. Next, we present our updated MLOps reference architecture, along with the details of its processes. Finally, we provide guidance for deploying generative AI applications to production on Databricks, focusing on productionizing LLMs.\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " FIGURE (ID: 34)\n", - "

\n", - " \n", - " [53, 1180, 324, 1229]\n", - " \n", - "
\n", - "
\n", - " Description: The logo features the word \"databricks\" in bold, dark blue text accompanied by a stylized red and white graphic to the left.\n", - "
\n", - "
\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "output_type": "display_data", - "data": { - "text/html": [ - "
\n", - " \n", - "
\n", - " Page 6: 10 elements
\n", - " Original size: 1706×1280px | \n", - " Display size: 1024×768px | \n", - " Scale factor: 0.600
\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \"Page\n", - " \n", - "
\n", - "
\n", - " PAGE_H#35\n", - "
\n", - " \n", - "
\n", - "
\n", - " PAGE_HEADER #35\n", - "
\n", - "
\n", - " BIG BOOK OF MLOPS - 2ND EDITION\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " PAGE_H#36\n", - "
\n", - " \n", - "
\n", - "
\n", - " PAGE_HEADER #36\n", - "
\n", - "
\n", - " No content available\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " SECTIO#37\n", - "
\n", - " \n", - "
\n", - "
\n", - " SECTION_HEADER #37\n", - "
\n", - "
\n", - " CHAPTER 2 Big Book of MLOps V1 Recap\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " FIGURE#38\n", - "
\n", - " \n", - "
\n", - "
\n", - " FIGURE #38\n", - "
\n", - "
\n", - " Description: Three teal hexagons display line-art icons representing people, cylinders, and gears, labeled "ModelOps," "DataOps," and "DevOps."\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT#39\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT #39\n", - "
\n", - "
\n", - " We begin with a brief recap of the core points discussed in the previous version of the Big Book of MLOps. While the recommended reference architecture has evolved due to new features and product updates, the core themes discussed, such as the importance of MLOps, guiding principles and the fundamentals of MLOps on Databricks, remain pertinent. In this section we focus on summarizing those elements that remain unchanged. For a more in-depth discussion of any of these points, we refer the reader ...\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " SECTIO#40\n", - "
\n", - " \n", - "
\n", - "
\n", - " SECTION_HEADER #40\n", - "
\n", - "
\n", - " Why should I care about MLOps?\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT#41\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT #41\n", - "
\n", - "
\n", - " We continue to stress the importance of defining an effective MLOps strategy. Databricks customers like CareSource, which has since implemented our recommended MLOps architecture, have witnessed firsthand the value this can bring. Through streamlining the process of delivering models to production, time to business value is accelerated. This efficiency has the knock-on effect of giving data science teams the freedom and confidence to transition to subsequent projects without the need for continu...\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " SECTIO#42\n", - "
\n", - " \n", - "
\n", - "
\n", - " SECTION_HEADER #42\n", - "
\n", - "
\n", - " Guiding principles\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT#43\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT #43\n", - "
\n", - "
\n", - " One guiding principle that continues to lie at the heart of the Lakehouse AI vision is taking a data-centric approach to machine learning. With the increasing prevalence of generative AI, this perspective remains just as important. The core constituents of any ML project can be viewed simply as data pipelines: feature engineering, training, model deployment, inference and monitoring pipelines are all data pipelines. As such, operationalizing an ML solution requires joining data from predictions,...\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " FIGURE#44\n", - "
\n", - " \n", - "
\n", - "
\n", - " FIGURE #44\n", - "
\n", - "
\n", - " Description: The logo features the word "databricks" in bold, dark blue text accompanied by a stylized red and white graphic to the left.\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "output_type": "display_data", - "data": { - "text/html": [ - "\n", - "
\n", - "

\uD83D\uDCCB Page 6 Elements (10 items)

\n", - " \n", - "
\n", - "
\n", - "

\n", - " PAGE HEADER (ID: 35)\n", - "

\n", - " \n", - " [59, 61, 347, 77]\n", - " \n", - "
\n", - "
\n", - " BIG BOOK OF MLOPS - 2ND EDITION\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " PAGE HEADER (ID: 36)\n", - "

\n", - " \n", - " [1630, 61, 1643, 77]\n", - " \n", - "
\n", - "
\n", - " No content\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " SECTION HEADER (ID: 37)\n", - "

\n", - " \n", - " [59, 210, 415, 339]\n", - " \n", - "
\n", - "
\n", - " CHAPTER 2 Big Book of MLOps V1 Recap\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " FIGURE (ID: 38)\n", - "

\n", - " \n", - " [59, 533, 522, 733]\n", - " \n", - "
\n", - "
\n", - " Description: Three teal hexagons display line-art icons representing people, cylinders, and gears, labeled \"ModelOps,\" \"DataOps,\" and \"DevOps.\"\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " TEXT (ID: 39)\n", - "

\n", - " \n", - " [609, 256, 1617, 439]\n", - " \n", - "
\n", - "
\n", - " We begin with a brief recap of the core points discussed in the previous version of the Big Book of MLOps. While the recommended reference architecture has evolved due to new features and product updates, the core themes discussed, such as the importance of MLOps, guiding principles and the fundamentals of MLOps on Databricks, remain pertinent. In this section we focus on summarizing those elements that remain unchanged. For a more in-depth discussion of any of these points, we refer the reader to last year's Big Book of MLOps.\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " SECTION HEADER (ID: 40)\n", - "

\n", - " \n", - " [609, 479, 1083, 511]\n", - " \n", - "
\n", - "
\n", - " Why should I care about MLOps?\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " TEXT (ID: 41)\n", - "

\n", - " \n", - " [609, 527, 1597, 709]\n", - " \n", - "
\n", - "
\n", - " We continue to stress the importance of defining an effective MLOps strategy. Databricks customers like CareSource, which has since implemented our recommended MLOps architecture, have witnessed firsthand the value this can bring. Through streamlining the process of delivering models to production, time to business value is accelerated. This efficiency has the knock-on effect of giving data science teams the freedom and confidence to transition to subsequent projects without the need for continuous manual oversight of models in production.\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " SECTION HEADER (ID: 42)\n", - "

\n", - " \n", - " [609, 749, 876, 781]\n", - " \n", - "
\n", - "
\n", - " Guiding principles\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " TEXT (ID: 43)\n", - "

\n", - " \n", - " [609, 799, 1601, 1009]\n", - " \n", - "
\n", - "
\n", - " One guiding principle that continues to lie at the heart of the Lakehouse AI vision is taking a data-centric approach to machine learning. With the increasing prevalence of generative AI, this perspective remains just as important. The core constituents of any ML project can be viewed simply as data pipelines: feature engineering, training, model deployment, inference and monitoring pipelines are all data pipelines. As such, operationalizing an ML solution requires joining data from predictions, monitoring and feature tables with other relevant data. Fundamentally, the simplest way to achieve this is to develop AI-powered solutions on the same platform used to manage production data.\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " FIGURE (ID: 44)\n", - "

\n", - " \n", - " [53, 1180, 324, 1229]\n", - " \n", - "
\n", - "
\n", - " Description: The logo features the word \"databricks\" in bold, dark blue text accompanied by a stylized red and white graphic to the left.\n", - "
\n", - "
\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "output_type": "display_data", - "data": { - "text/html": [ - "
\n", - " \n", - "
\n", - " Page 7: 9 elements
\n", - " Original size: 1706×1280px | \n", - " Display size: 1024×768px | \n", - " Scale factor: 0.600
\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \"Page\n", - " \n", - "
\n", - "
\n", - " PAGE_H#45\n", - "
\n", - " \n", - "
\n", - "
\n", - " PAGE_HEADER #45\n", - "
\n", - "
\n", - " BIG BOOK OF MLOPs - 2ND EDITION\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " PAGE_H#46\n", - "
\n", - " \n", - "
\n", - "
\n", - " PAGE_HEADER #46\n", - "
\n", - "
\n", - " No content available\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " SECTIO#47\n", - "
\n", - " \n", - "
\n", - "
\n", - " SECTION_HEADER #47\n", - "
\n", - "
\n", - " Semantics of development, staging and production\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " FIGURE#48\n", - "
\n", - " \n", - "
\n", - "
\n", - " FIGURE #48\n", - "
\n", - "
\n", - " Description: Three icons representing "Code," "Data," and "Models" are arranged horizontally with labels below each.\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT#49\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT #49\n", - "
\n", - "
\n", - " Note: Throughout this paper we operate under the assumption of three distinct execution environments — development, staging and production — in the form of three separate Databricks workspaces. There can be variations of these three stages, such as alternative naming conventions or splitting staging into separate “test” and “QA” substages. Although not recommended, it is also possible to create three distinct environments within a single Databricks workspace through the use of access controls an...\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT#50\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT #50\n", - "
\n", - "
\n", - " An ML solution comprises data, code and models. These assets need to be developed, tested (staging) and deployed (production). For each of these stages, we also need to operate within an execution environment. As such, each of data, code, models and execution environments are notionally divided into development, staging and production.\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " FIGURE#51\n", - "
\n", - " \n", - "
\n", - "
\n", - " FIGURE #51\n", - "
\n", - "
\n", - " Description: Three colored boxes labeled "Developed," "Tested," and "Deployed" are arranged horizontally with corresponding lowercase labels below and arrows pointing right.\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT#52\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT #52\n", - "
\n", - "
\n", - " Each of these stages has distinct access controls and quality guarantees, ranging from the open and exploratory development stage through to the locked-down and quality-assured production stage.\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " FIGURE#53\n", - "
\n", - " \n", - "
\n", - "
\n", - " FIGURE #53\n", - "
\n", - "
\n", - " Description: The logo features the word "databricks" in a bold, sans-serif font accompanied by a red and gray geometric design to the left.\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "output_type": "display_data", - "data": { - "text/html": [ - "\n", - "
\n", - "

\uD83D\uDCCB Page 7 Elements (9 items)

\n", - " \n", - "
\n", - "
\n", - "

\n", - " PAGE HEADER (ID: 45)\n", - "

\n", - " \n", - " [59, 64, 347, 76]\n", - " \n", - "
\n", - "
\n", - " BIG BOOK OF MLOPs - 2ND EDITION\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " PAGE HEADER (ID: 46)\n", - "

\n", - " \n", - " [1634, 64, 1651, 76]\n", - " \n", - "
\n", - "
\n", - " No content\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " SECTION HEADER (ID: 47)\n", - "

\n", - " \n", - " [607, 261, 1352, 291]\n", - " \n", - "
\n", - "
\n", - " Semantics of development, staging and production\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " FIGURE (ID: 48)\n", - "

\n", - " \n", - " [866, 317, 1329, 484]\n", - " \n", - "
\n", - "
\n", - " Description: Three icons representing \"Code,\" \"Data,\" and \"Models\" are arranged horizontally with labels below each.\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " TEXT (ID: 49)\n", - "

\n", - " \n", - " [59, 417, 457, 853]\n", - " \n", - "
\n", - "
\n", - " Note: Throughout this paper we operate under the assumption of three distinct execution environments — development, staging and production — in the form of three separate Databricks workspaces. There can be variations of these three stages, such as alternative naming conventions or splitting staging into separate “test” and “QA” substages. Although not recommended, it is also possible to create three distinct environments within a single Databricks workspace through the use of access controls and Git branches. Regardless of how environment separation is achieved, the core principles of the workflow and recommendations presented are generally applicable.\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " TEXT (ID: 50)\n", - "

\n", - " \n", - " [607, 546, 1580, 660]\n", - " \n", - "
\n", - "
\n", - " An ML solution comprises data, code and models. These assets need to be developed, tested (staging) and deployed (production). For each of these stages, we also need to operate within an execution environment. As such, each of data, code, models and execution environments are notionally divided into development, staging and production.\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " FIGURE (ID: 51)\n", - "

\n", - " \n", - " [786, 704, 1409, 921]\n", - " \n", - "
\n", - "
\n", - " Description: Three colored boxes labeled \"Developed,\" \"Tested,\" and \"Deployed\" are arranged horizontally with corresponding lowercase labels below and arrows pointing right.\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " TEXT (ID: 52)\n", - "

\n", - " \n", - " [607, 974, 1544, 1024]\n", - " \n", - "
\n", - "
\n", - " Each of these stages has distinct access controls and quality guarantees, ranging from the open and exploratory development stage through to the locked-down and quality-assured production stage.\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " FIGURE (ID: 53)\n", - "

\n", - " \n", - " [53, 1180, 324, 1226]\n", - " \n", - "
\n", - "
\n", - " Description: The logo features the word \"databricks\" in a bold, sans-serif font accompanied by a red and gray geometric design to the left.\n", - "
\n", - "
\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "output_type": "display_data", - "data": { - "text/html": [ - "
\n", - " \n", - "
\n", - " Page 8: 11 elements
\n", - " Original size: 1706×1280px | \n", - " Display size: 1024×768px | \n", - " Scale factor: 0.600
\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \"Page\n", - " \n", - "
\n", - "
\n", - " PAGE_H#54\n", - "
\n", - " \n", - "
\n", - "
\n", - " PAGE_HEADER #54\n", - "
\n", - "
\n", - " BIG BOOK OF MLOPS - 2ND EDITION\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " PAGE_H#55\n", - "
\n", - " \n", - "
\n", - "
\n", - " PAGE_HEADER #55\n", - "
\n", - "
\n", - " 8\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " SECTIO#56\n", - "
\n", - " \n", - "
\n", - "
\n", - " SECTION_HEADER #56\n", - "
\n", - "
\n", - " ML deployment patterns\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT#57\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT #57\n", - "
\n", - "
\n", - " Code and models often progress asynchronously through these stages. Thus, it becomes crucial to leverage a solution that allows for the management of model artifacts independently of code, making it possible to update a production model without necessarily making a code change. Data, much like code and models, can be labeled as development, staging or production, indicating not only its origin but also its quality and reliability.\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT#58\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT #58\n", - "
\n", - "
\n", - " Given the independent lifecycles of code and models, there are two opposing strategies to moving code and ML models from development, through staging and subsequently to production:\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " SECTIO#59\n", - "
\n", - " \n", - "
\n", - "
\n", - " SECTION_HEADER #59\n", - "
\n", - "
\n", - " DEPLOY CODE\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " FIGURE#60\n", - "
\n", - " \n", - "
\n", - "
\n", - " FIGURE #60\n", - "
\n", - "
\n", - " Description: Three rectangular boxes, labeled "dev," "staging," and "prod," illustrate a sequential process with arrows and icons representing code and models.\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT#61\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT #61\n", - "
\n", - "
\n", - " - Code for an ML project is developed in the development environment, and this code is then moved to the staging environment, where it is tested. Following successful testing, the project code is deployed to the production environment, where it is executed.\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT#62\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT #62\n", - "
\n", - "
\n", - " - Model training code is tested in the staging environment using a subset of data, and the model training pipeline is executed in the production environment\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT#63\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT #63\n", - "
\n", - "
\n", - " - The model deployment process of validating a model and additionally conducting comparisons versus any existing production model all run within the production environment\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " FIGURE#64\n", - "
\n", - " \n", - "
\n", - "
\n", - " FIGURE #64\n", - "
\n", - "
\n", - " Description: The logo features the Databricks name in bold text accompanied by a stylized red and gray graphic to the left.\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "output_type": "display_data", - "data": { - "text/html": [ - "\n", - "
\n", - "

\uD83D\uDCCB Page 8 Elements (11 items)

\n", - " \n", - "
\n", - "
\n", - "

\n", - " PAGE HEADER (ID: 54)\n", - "

\n", - " \n", - " [59, 61, 345, 77]\n", - " \n", - "
\n", - "
\n", - " BIG BOOK OF MLOPS - 2ND EDITION\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " PAGE HEADER (ID: 55)\n", - "

\n", - " \n", - " [1634, 61, 1647, 77]\n", - " \n", - "
\n", - "
\n", - " 8\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " SECTION HEADER (ID: 56)\n", - "

\n", - " \n", - " [607, 259, 969, 293]\n", - " \n", - "
\n", - "
\n", - " ML deployment patterns\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " TEXT (ID: 57)\n", - "

\n", - " \n", - " [607, 309, 1578, 456]\n", - " \n", - "
\n", - "
\n", - " Code and models often progress asynchronously through these stages. Thus, it becomes crucial to leverage a solution that allows for the management of model artifacts independently of code, making it possible to update a production model without necessarily making a code change. Data, much like code and models, can be labeled as development, staging or production, indicating not only its origin but also its quality and reliability.\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " TEXT (ID: 58)\n", - "

\n", - " \n", - " [607, 481, 1580, 534]\n", - " \n", - "
\n", - "
\n", - " Given the independent lifecycles of code and models, there are two opposing strategies to moving code and ML models from development, through staging and subsequently to production:\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " SECTION HEADER (ID: 59)\n", - "

\n", - " \n", - " [607, 604, 741, 620]\n", - " \n", - "
\n", - "
\n", - " DEPLOY CODE\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " FIGURE (ID: 60)\n", - "

\n", - " \n", - " [605, 620, 1554, 826]\n", - " \n", - "
\n", - "
\n", - " Description: Three rectangular boxes, labeled \"dev,\" \"staging,\" and \"prod,\" illustrate a sequential process with arrows and icons representing code and models.\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " TEXT (ID: 61)\n", - "

\n", - " \n", - " [638, 899, 1527, 981]\n", - " \n", - "
\n", - "
\n", - " - Code for an ML project is developed in the development environment, and this code is then moved to the staging environment, where it is tested. Following successful testing, the project code is deployed to the production environment, where it is executed.\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " TEXT (ID: 62)\n", - "

\n", - " \n", - " [638, 1007, 1540, 1060]\n", - " \n", - "
\n", - "
\n", - " - Model training code is tested in the staging environment using a subset of data, and the model training pipeline is executed in the production environment\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " TEXT (ID: 63)\n", - "

\n", - " \n", - " [638, 1087, 1544, 1139]\n", - " \n", - "
\n", - "
\n", - " - The model deployment process of validating a model and additionally conducting comparisons versus any existing production model all run within the production environment\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " FIGURE (ID: 64)\n", - "

\n", - " \n", - " [53, 1180, 324, 1229]\n", - " \n", - "
\n", - "
\n", - " Description: The logo features the Databricks name in bold text accompanied by a stylized red and gray graphic to the left.\n", - "
\n", - "
\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "output_type": "display_data", - "data": { - "text/html": [ - "
\n", - " \n", - "
\n", - " Page 9: 8 elements
\n", - " Original size: 1706×1280px | \n", - " Display size: 1024×768px | \n", - " Scale factor: 0.600
\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \"Page\n", - " \n", - "
\n", - "
\n", - " PAGE_H#65\n", - "
\n", - " \n", - "
\n", - "
\n", - " PAGE_HEADER #65\n", - "
\n", - "
\n", - " BIG BOOK OF MLOPS - 2ND EDITION\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " PAGE_H#66\n", - "
\n", - " \n", - "
\n", - "
\n", - " PAGE_HEADER #66\n", - "
\n", - "
\n", - " No content available\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " FIGURE#67\n", - "
\n", - " \n", - "
\n", - "
\n", - " FIGURE #67\n", - "
\n", - "
\n", - " Description: Three boxes labeled "dev," "staging," and "prod" display brain icons with code symbols, connected by dashed arrows.\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT#68\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT #68\n", - "
\n", - "
\n", - " - Model training is executed in the development environment. The produced model artifact is then moved to the staging environment for model validation checks, prior to deployment of the model to the production environment.\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT#69\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT #69\n", - "
\n", - "
\n", - " - This approach requires a separate path for deploying ancillary code such as inference and monitoring code. Subsequently, any pipelines that need to run in the production environment to support the operationalization of the model will necessarily need to go through a separate “deploy code” lifecycle — the code for these components being tested in staging and then deployed to production.\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT#70\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT #70\n", - "
\n", - "
\n", - " - This pattern is typically used when deploying a one-off model, or when model training is expensive and read-access to production data from the development environment is possible\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT#71\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT #71\n", - "
\n", - "
\n", - " As in our prior paper, we recommend a deploy code approach for the majority of use cases, and the reference architecture presented in this update continues to follow this recommendation.\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " FIGURE#72\n", - "
\n", - " \n", - "
\n", - "
\n", - " FIGURE #72\n", - "
\n", - "
\n", - " Description: The logo features the Databricks name in bold text accompanied by a stylized red and gray graphic to the left.\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "output_type": "display_data", - "data": { - "text/html": [ - "\n", - "
\n", - "

\uD83D\uDCCB Page 9 Elements (8 items)

\n", - " \n", - "
\n", - "
\n", - "

\n", - " PAGE HEADER (ID: 65)\n", - "

\n", - " \n", - " [59, 61, 345, 77]\n", - " \n", - "
\n", - "
\n", - " BIG BOOK OF MLOPS - 2ND EDITION\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " PAGE HEADER (ID: 66)\n", - "

\n", - " \n", - " [1634, 61, 1647, 77]\n", - " \n", - "
\n", - "
\n", - " No content\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " FIGURE (ID: 67)\n", - "

\n", - " \n", - " [602, 253, 1405, 481]\n", - " \n", - "
\n", - "
\n", - " Description: Three boxes labeled \"dev,\" \"staging,\" and \"prod\" display brain icons with code symbols, connected by dashed arrows.\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " TEXT (ID: 68)\n", - "

\n", - " \n", - " [640, 530, 1584, 616]\n", - " \n", - "
\n", - "
\n", - " - Model training is executed in the development environment. The produced model artifact is then moved to the staging environment for model validation checks, prior to deployment of the model to the production environment.\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " TEXT (ID: 69)\n", - "

\n", - " \n", - " [640, 640, 1573, 789]\n", - " \n", - "
\n", - "
\n", - " - This approach requires a separate path for deploying ancillary code such as inference and monitoring code. Subsequently, any pipelines that need to run in the production environment to support the operationalization of the model will necessarily need to go through a separate “deploy code” lifecycle — the code for these components being tested in staging and then deployed to production.\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " TEXT (ID: 70)\n", - "

\n", - " \n", - " [640, 811, 1578, 867]\n", - " \n", - "
\n", - "
\n", - " - This pattern is typically used when deploying a one-off model, or when model training is expensive and read-access to production data from the development environment is possible\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " TEXT (ID: 71)\n", - "

\n", - " \n", - " [607, 907, 1544, 960]\n", - " \n", - "
\n", - "
\n", - " As in our prior paper, we recommend a deploy code approach for the majority of use cases, and the reference architecture presented in this update continues to follow this recommendation.\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " FIGURE (ID: 72)\n", - "

\n", - " \n", - " [53, 1180, 324, 1229]\n", - " \n", - "
\n", - "
\n", - " Description: The logo features the Databricks name in bold text accompanied by a stylized red and gray graphic to the left.\n", - "
\n", - "
\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "output_type": "display_data", - "data": { - "text/html": [ - "
\n", - " \n", - "
\n", - " Page 10: 11 elements
\n", - " Original size: 1706×1280px | \n", - " Display size: 1024×768px | \n", - " Scale factor: 0.600
\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \"Page\n", - " \n", - "
\n", - "
\n", - " PAGE_H#73\n", - "
\n", - " \n", - "
\n", - "
\n", - " PAGE_HEADER #73\n", - "
\n", - "
\n", - " BIG BOOK OF MLOPS - 2ND EDITION\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " PAGE_H#74\n", - "
\n", - " \n", - "
\n", - "
\n", - " PAGE_HEADER #74\n", - "
\n", - "
\n", - " 10\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " SECTIO#75\n", - "
\n", - " \n", - "
\n", - "
\n", - " SECTION_HEADER #75\n", - "
\n", - "
\n", - " CHAPTER 3 What's New?\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT#76\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT #76\n", - "
\n", - "
\n", - " In this section we outline the key features and product updates introduced into our updated MLOps reference architecture. For each of these, we highlight the benefits they bring and how they impact our end-to-end MLOps workflow.\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " SECTIO#77\n", - "
\n", - " \n", - "
\n", - "
\n", - " SECTION_HEADER #77\n", - "
\n", - "
\n", - " Unity Catalog\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT#78\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT #78\n", - "
\n", - "
\n", - " The Lakehouse forms the foundation of a data-centric AI platform. Key to this is the ability to manage both data and AI assets from a unified governance solution on the Lakehouse. Databricks Unity Catalog enables this by providing centralized access control, auditing, lineage, and data discovery capabilities across Databricks workspaces.\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT#79\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT #79\n", - "
\n", - "
\n", - " These benefits are now extended to MLflow models with the introduction of Models in Unity Catalog. By providing a hosted version of the MLflow Model Registry in Unity Catalog, the full lifecycle of an ML model can be managed while leveraging Unity Catalog's capability to share assets across Databricks workspaces and trace lineage across both data and models.\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT#80\n", - "
\n", - " \n", - "
\n", - "
\n", - " TEXT #80\n", - "
\n", - "
\n", - " In addition to managing ML models, feature tables are also a part of Unity Catalog. With Feature Engineering in Unity Catalog, any Delta table in Unity Catalog that has been assigned a primary key (and additionally a timestamp key) can be used as a source of features to train and serve models. Furthermore, feature tables can now also be shared across different workspaces, and lineage recorded between other assets in the Lakehouse.\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " FIGURE#81\n", - "
\n", - " \n", - "
\n", - "
\n", - " FIGURE #81\n", - "
\n", - "
\n", - " Description: A diagram illustrates a data processing flow with icons representing volumes, tables, features, models, inference tables, and metric tables, connected by arrows.\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " CAPTIO#82\n", - "
\n", - " \n", - "
\n", - "
\n", - " CAPTION #82\n", - "
\n", - "
\n", - " Assets of an ML workflow, all managed via Unity Catalog\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - " FIGURE#83\n", - "
\n", - " \n", - "
\n", - "
\n", - " FIGURE #83\n", - "
\n", - "
\n", - " Description: The logo features the word "databricks" in bold, dark blue text accompanied by a stylized red and white graphic to the left.\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "output_type": "display_data", - "data": { - "text/html": [ - "\n", - "
\n", - "

\uD83D\uDCCB Page 10 Elements (11 items)

\n", - " \n", - "
\n", - "
\n", - "

\n", - " PAGE HEADER (ID: 73)\n", - "

\n", - " \n", - " [59, 61, 345, 77]\n", - " \n", - "
\n", - "
\n", - " BIG BOOK OF MLOPS - 2ND EDITION\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " PAGE HEADER (ID: 74)\n", - "

\n", - " \n", - " [1624, 61, 1647, 77]\n", - " \n", - "
\n", - "
\n", - " 10\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " SECTION HEADER (ID: 75)\n", - "

\n", - " \n", - " [59, 210, 307, 283]\n", - " \n", - "
\n", - "
\n", - " CHAPTER 3 What's New?\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " TEXT (ID: 76)\n", - "

\n", - " \n", - " [607, 257, 1567, 343]\n", - " \n", - "
\n", - "
\n", - " In this section we outline the key features and product updates introduced into our updated MLOps reference architecture. For each of these, we highlight the benefits they bring and how they impact our end-to-end MLOps workflow.\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " SECTION HEADER (ID: 77)\n", - "

\n", - " \n", - " [607, 384, 805, 417]\n", - " \n", - "
\n", - "
\n", - " Unity Catalog\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " TEXT (ID: 78)\n", - "

\n", - " \n", - " [607, 433, 1569, 550]\n", - " \n", - "
\n", - "
\n", - " The Lakehouse forms the foundation of a data-centric AI platform. Key to this is the ability to manage both data and AI assets from a unified governance solution on the Lakehouse. Databricks Unity Catalog enables this by providing centralized access control, auditing, lineage, and data discovery capabilities across Databricks workspaces.\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " TEXT (ID: 79)\n", - "

\n", - " \n", - " [607, 574, 1554, 691]\n", - " \n", - "
\n", - "
\n", - " These benefits are now extended to MLflow models with the introduction of Models in Unity Catalog. By providing a hosted version of the MLflow Model Registry in Unity Catalog, the full lifecycle of an ML model can be managed while leveraging Unity Catalog's capability to share assets across Databricks workspaces and trace lineage across both data and models.\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " TEXT (ID: 80)\n", - "

\n", - " \n", - " [607, 716, 1567, 864]\n", - " \n", - "
\n", - "
\n", - " In addition to managing ML models, feature tables are also a part of Unity Catalog. With Feature Engineering in Unity Catalog, any Delta table in Unity Catalog that has been assigned a primary key (and additionally a timestamp key) can be used as a source of features to train and serve models. Furthermore, feature tables can now also be shared across different workspaces, and lineage recorded between other assets in the Lakehouse.\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " FIGURE (ID: 81)\n", - "

\n", - " \n", - " [605, 893, 1603, 1139]\n", - " \n", - "
\n", - "
\n", - " Description: A diagram illustrates a data processing flow with icons representing volumes, tables, features, models, inference tables, and metric tables, connected by arrows.\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " CAPTION (ID: 82)\n", - "

\n", - " \n", - " [607, 1157, 992, 1174]\n", - " \n", - "
\n", - "
\n", - " Assets of an ML workflow, all managed via Unity Catalog\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "

\n", - " FIGURE (ID: 83)\n", - "

\n", - " \n", - " [53, 1180, 324, 1229]\n", - " \n", - "
\n", - "
\n", - " Description: The logo features the word \"databricks\" in bold, dark blue text accompanied by a stylized red and white graphic to the left.\n", - "
\n", - "
\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "for parsed_result in parsed_results:\n", - " render_ai_parse_output(parsed_result, page_selection)" - ] - } - ], - "metadata": { - "application/vnd.databricks.v1+notebook": { - "computePreferences": { - "hardware": { - "accelerator": null, - "gpuPoolId": null, - "memory": null - } - }, - "dashboards": [], - "environmentMetadata": { - "base_environment": "", - "environment_version": "3" - }, - "inputWidgetPreferences": null, - "language": "python", - "notebookMetadata": { - "pythonIndentUnit": 2 - }, - "notebookName": "ai_parse_document -- debug output", - "widgets": { - "image_output_path": { - "currentValue": "/Volumes/users/jas_bali/pdfs_for_bricks/", - "nuid": "b024b356-873a-4d79-9498-c33aa08ab48a", - "typedWidgetInfo": { - "autoCreated": false, - "defaultValue": "/Volumes/main/default/parsed_output/", - "label": null, - "name": "image_output_path", - "options": { - "widgetDisplayType": "Text", - "validationRegex": null - }, - "parameterDataType": "String" - }, - "widgetInfo": { - "widgetType": "text", - "defaultValue": "/Volumes/main/default/parsed_output/", - "label": null, - "name": "image_output_path", - "options": { - "widgetType": "text", - "autoCreated": null, - "validationRegex": null - } - } - }, - "input_file": { - "currentValue": "/Volumes/users/jas_bali/pdfs_ie/2023-10-EB-Big-Book-of-MLOps-2nd-Edition.pdf", - "nuid": "c8c4fa93-30c4-4462-b585-256a6b1d60d2", - "typedWidgetInfo": { - "autoCreated": false, - "defaultValue": "/Volumes/main/default/source_documents/sample.pdf", - "label": null, - "name": "input_file", - "options": { - "widgetDisplayType": "Text", - "validationRegex": null - }, - "parameterDataType": "String" - }, - "widgetInfo": { - "widgetType": "text", - "defaultValue": "/Volumes/main/default/source_documents/sample.pdf", - "label": null, - "name": "input_file", - "options": { - "widgetType": "text", - 
"autoCreated": null, - "validationRegex": null - } - } - }, - "page_selection": { - "currentValue": "1-10", - "nuid": "cd86904e-36de-4980-a660-304943552e8d", - "typedWidgetInfo": { - "autoCreated": false, - "defaultValue": "all", - "label": null, - "name": "page_selection", - "options": { - "widgetDisplayType": "Text", - "validationRegex": null - }, - "parameterDataType": "String" - }, - "widgetInfo": { - "widgetType": "text", - "defaultValue": "all", - "label": null, - "name": "page_selection", - "options": { - "widgetType": "text", - "autoCreated": null, - "validationRegex": null - } - } - } - } - }, - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file diff --git a/knowledge_base/workflow_with_ai_parse_document/src/explorations/ai_parse_document -- debug output.py b/knowledge_base/workflow_with_ai_parse_document/src/explorations/ai_parse_document -- debug output.py new file mode 100644 index 00000000..2f39afab --- /dev/null +++ b/knowledge_base/workflow_with_ai_parse_document/src/explorations/ai_parse_document -- debug output.py @@ -0,0 +1,782 @@ +# Databricks notebook source +# MAGIC %md +# MAGIC # 🔍 AI Parse Document Debug Interface +# MAGIC +# MAGIC Version 1.3 +# MAGIC +# MAGIC Last update: Oct 6, 2025 +# MAGIC +# MAGIC Changelog: +# MAGIC - Simplified widget parameters: `input_file` and `image_output_path` now accept full volume paths +# MAGIC - Removed separate `catalog`, `schema`, `volume` widgets +# MAGIC - `input_file` supports wildcards for processing multiple files (e.g., `/Volumes/catalog/schema/volume/input/*`) +# MAGIC +# MAGIC ## Overview +# MAGIC This notebook provides a **visual debugging interface** for analyzing the output of Databricks' `ai_parse_document` function. It renders parsed documents with interactive bounding box overlays, allowing you to inspect what content was extracted from each region of your documents. 
+# MAGIC +# MAGIC ## Features +# MAGIC - 📊 **Visual Bounding Boxes**: Color-coded overlays showing the exact regions where text/elements were detected +# MAGIC - 🎯 **Interactive Tooltips**: Hover over any bounding box to see the parsed content from that region +# MAGIC - 📐 **Automatic Scaling**: Large documents are automatically scaled to fit within 1024px width for optimal viewing +# MAGIC - 🎨 **Element Type Visualization**: Different colors for different element types (text, headers, tables, figures, etc.) +# MAGIC +# MAGIC ## Required Parameters +# MAGIC +# MAGIC This interface requires widget parameters to be configured before running: +# MAGIC +# MAGIC ### 1. `input_file` +# MAGIC - **Description**: Full Unity Catalog volume path to the document(s) you want to parse and visualize +# MAGIC - **Examples**: +# MAGIC - Single file: `/Volumes/catalog/schema/volume/input/document.pdf` +# MAGIC - All files in directory: `/Volumes/catalog/schema/volume/input/*` +# MAGIC - Pattern matching: `/Volumes/catalog/schema/volume/input/*.pdf` +# MAGIC - **Requirements**: Read access to the volume containing your PDF/image files +# MAGIC +# MAGIC ### 2. `image_output_path` +# MAGIC - **Description**: Full Unity Catalog volume path where `ai_parse_document` will store the extracted page images +# MAGIC - **Example**: `/Volumes/catalog/schema/volume/output/` +# MAGIC - **Requirements**: Write access required for storing intermediate image outputs +# MAGIC - **Note**: As documented in the [official Databricks documentation](https://docs.databricks.com/aws/en/sql/language-manual/functions/ai_parse_document), this path is used by the parsing function to store page images that are referenced in the output +# MAGIC +# MAGIC ### 3. 
`page_selection` +# MAGIC - **Description**: Specifies which pages to display in the visualization +# MAGIC - **Supported formats**: +# MAGIC - `"all"` or leave empty: Display all pages +# MAGIC - `"3"`: Display only page 3 (1-indexed) +# MAGIC - `"1-5"`: Display pages 1 through 5 (inclusive, 1-indexed) +# MAGIC - `"1,3,5"`: Display specific pages (1-indexed) +# MAGIC - `"1-3,7,10-12"`: Mixed ranges and individual pages +# MAGIC +# MAGIC ## Usage Instructions +# MAGIC +# MAGIC 1. **Clone this notebook** to your workspace: +# MAGIC - Select **"File -> Clone"** button in the top toolbar +# MAGIC - Choose your desired location in your workspace +# MAGIC - This ensures you have a personal copy you can modify and run +# MAGIC +# MAGIC 2. **Prepare your Unity Catalog volumes**: +# MAGIC - Create or identify a volume for your PDF/image files +# MAGIC - Create or identify a volume for output images +# MAGIC - Upload your PDF files to the input location +# MAGIC +# MAGIC 3. **Configure the widget parameters** at the top of this notebook: +# MAGIC - Set `input_file` to the full volume path (file or directory with wildcard) +# MAGIC - Set `image_output_path` to the full volume path for outputs +# MAGIC - Set `page_selection` to control which pages to visualize +# MAGIC +# MAGIC 4. **Run all code cells** which will generate visual debugging results. 
+# MAGIC +# MAGIC ## What You'll See +# MAGIC +# MAGIC - **Document Summary**: Overview of pages, element counts, and document metadata +# MAGIC - **Color Legend**: Visual guide showing which colors represent which element types +# MAGIC - **Annotated Images**: Each page with overlaid bounding boxes +# MAGIC - Hover over any box to see the extracted content +# MAGIC - Yellow highlight indicates the currently hovered element +# MAGIC - **Parsed Elements List**: Complete list of all extracted elements with their content + +# COMMAND ---------- + +# Exec Parameters + +dbutils.widgets.text("input_file", "/Volumes/main/default/source_documents/sample.pdf") +dbutils.widgets.text("image_output_path", "/Volumes/main/default/parsed_output/") +dbutils.widgets.text("page_selection", "all") + +input_file = dbutils.widgets.get("input_file") +image_output_path = dbutils.widgets.get("image_output_path") +page_selection = dbutils.widgets.get("page_selection") + +# COMMAND ---------- + +# DBTITLE 1,Configuration Parameters +# Path configuration - use widget values as-is + +source_files = input_file + +# Parse page selection string and return list of page indices to display. 
+# +# Supported formats: +# - "all" or None: Display all pages +# - "3": Display specific page (1-indexed) +# - "1-5": Display page range (inclusive, 1-indexed) +# - "1,3,5": Display list of specific pages (1-indexed) +# - "1-3,7,10-12": Mixed ranges and individual pages +page_selection = f"{page_selection}" + +# COMMAND ---------- + +# DBTITLE 1,Run Document Parse Code (may take some time) +# SQL statement with ai_parse_document() +# Note: input_file can be a single file path or a directory path with wildcard +sql = f''' +with parsed_documents AS ( + SELECT + path, + ai_parse_document(content + , + map( + 'version', '2.0', + 'imageOutputPath', '{image_output_path}', + 'descriptionElementTypes', '*' + ) + ) as parsed + FROM + read_files('{source_files}', format => 'binaryFile') +) +select * from parsed_documents +''' + +parsed_results = [row.parsed for row in spark.sql(sql).collect()] + +# COMMAND ---------- + +import json +from typing import Dict, List, Any, Optional, Tuple, Set, Union +from IPython.display import HTML, display +import base64 +import os +from PIL import Image +import io + +class DocumentRenderer: + def __init__(self): + # Color mapping for different element types + self.element_colors = { + 'section_header': '#FF6B6B', + 'text': '#4ECDC4', + 'figure': '#45B7D1', + 'caption': '#96CEB4', + 'page_footer': '#FFEAA7', + 'page_header': '#DDA0DD', + 'table': '#98D8C8', + 'list': '#F7DC6F', + 'default': '#BDC3C7' + } + + def _parse_page_selection(self, page_selection: Union[str, None], total_pages: int) -> Set[int]: + """Parse page selection string and return set of page indices (0-based). 
+ + Args: + page_selection: Selection string or None + total_pages: Total number of pages available + + Returns: + Set of 0-based page indices to display + """ + # Handle None or "all" - return all pages + if page_selection is None or page_selection.lower() == "all": + return set(range(total_pages)) + + selected_pages = set() + + # Clean the input + page_selection = page_selection.strip() + + # Split by commas for multiple selections + parts = page_selection.split(',') + + for part in parts: + part = part.strip() + + # Check if it's a range (contains hyphen) + if '-' in part: + try: + # Split range and convert to integers + range_parts = part.split('-') + if len(range_parts) == 2: + start = int(range_parts[0].strip()) + end = int(range_parts[1].strip()) + + # Convert from 1-indexed to 0-indexed + start_idx = start - 1 + end_idx = end - 1 + + # Add all pages in range (inclusive) + for i in range(start_idx, end_idx + 1): + if 0 <= i < total_pages: + selected_pages.add(i) + except ValueError: + print(f"Warning: Invalid range '{part}' in page selection") + else: + # Single page number + try: + page_num = int(part.strip()) + # Convert from 1-indexed to 0-indexed + page_idx = page_num - 1 + if 0 <= page_idx < total_pages: + selected_pages.add(page_idx) + else: + print(f"Warning: Page {page_num} is out of range (1-{total_pages})") + except ValueError: + print(f"Warning: Invalid page number '{part}' in page selection") + + # If no valid pages were selected, default to all pages + if not selected_pages: + print(f"Warning: No valid pages in selection '{page_selection}'. 
Showing all pages.") + return set(range(total_pages)) + + return selected_pages + + def _get_element_color(self, element_type: str) -> str: + """Get color for element type.""" + return self.element_colors.get(element_type.lower(), self.element_colors['default']) + + def _get_image_dimensions(self, image_path: str) -> Optional[Tuple[int, int]]: + """Get dimensions of an image file.""" + try: + if os.path.exists(image_path): + with Image.open(image_path) as img: + return img.size # Returns (width, height) + return None + except Exception as e: + print(f"Error getting image dimensions for {image_path}: {e}") + return None + + def _load_image_as_base64(self, image_path: str) -> Optional[str]: + """Load image from file path and convert to base64.""" + try: + if os.path.exists(image_path): + with open(image_path, 'rb') as img_file: + img_data = img_file.read() + img_base64 = base64.b64encode(img_data).decode('utf-8') + ext = os.path.splitext(image_path)[1].lower() + if ext in ['.jpg', '.jpeg']: + return f"data:image/jpeg;base64,{img_base64}" + elif ext in ['.png']: + return f"data:image/png;base64,{img_base64}" + else: + return f"data:image/jpeg;base64,{img_base64}" + return None + except Exception as e: + print(f"Error loading image {image_path}: {e}") + return None + + def _render_element_content(self, element: Dict, for_tooltip: bool = False) -> str: + """Render element content with appropriate formatting for both tooltip and element list display. 
+ + Args: + element: The element dictionary containing content/description + for_tooltip: Whether this is for tooltip display (affects styling and truncation) + """ + element_type = element.get('type', 'unknown') + content = element.get('content', '') + description = element.get('description', '') + + display_content = "" + + if content: + if element_type == 'table': + # Render the HTML table with styling + table_html = content + + # Apply different styling based on context + if for_tooltip: + # Compact styling for tooltips with light theme + # Use full width available for tooltip tables + table_style = f'''style="width: 100%; border-collapse: collapse; margin: 5px 0; font-size: 10px;"''' + th_style = 'style="border: 1px solid #ddd; padding: 4px; background: #f8f9fa; color: #333; font-weight: bold; text-align: left; font-size: 10px;"' + td_style = 'style="border: 1px solid #ddd; padding: 4px; color: #333; font-size: 10px;"' + thead_style = 'style="background: #e9ecef;"' + else: + # Full styling for element list + table_style = '''style="width: 100%; border-collapse: collapse; margin: 10px 0; font-size: 13px;"''' + th_style = 'style="border: 1px solid #ddd; padding: 8px; background: #f5f5f5; font-weight: bold; text-align: left;"' + td_style = 'style="border: 1px solid #ddd; padding: 8px;"' + thead_style = 'style="background: #f0f0f0;"' + + # Apply styling transformations + if '' in table_html: + table_html = table_html.replace('
', f'
') + if '' in table_html: + table_html = table_html.replace('', f'') + + if for_tooltip: + display_content = table_html + else: + display_content = f"
{table_html}
" + else: + # Regular content handling + if for_tooltip and len(content) > 500: + # Truncate for tooltip display and escape HTML for safety + display_content = self._escape_for_html_attribute(content[:500] + "...") + else: + display_content = self._escape_for_html_attribute(content) if for_tooltip else content + elif description: + desc_content = description + if for_tooltip and len(desc_content) > 500: + desc_content = desc_content[:500] + "..." + + if for_tooltip: + display_content = self._escape_for_html_attribute(f"Description: {desc_content}") + else: + display_content = f"Description: {desc_content}" + else: + display_content = "No content available" if for_tooltip else "No content" + + return display_content + + def _escape_for_html_attribute(self, text: str) -> str: + """Escape text for safe use in HTML attributes.""" + return (text.replace('&', '&') + .replace('<', '<') + .replace('>', '>') + .replace('"', '"') + .replace("'", ''') + .replace('\n', '
')) + + def _calculate_tooltip_width(self, element: Dict, image_width: int) -> int: + """Calculate dynamic tooltip width based on table content.""" + element_type = element.get('type', 'unknown') + content = element.get('content', '') + + if element_type == 'table' and content: + # Count columns by looking for ', content, re.DOTALL | re.IGNORECASE) + if first_row_match: + first_row = first_row_match.group(1) + # Count th or td tags + th_count = len(re.findall(r']*>', first_row, re.IGNORECASE)) + td_count = len(re.findall(r']*>', first_row, re.IGNORECASE)) + column_count = max(th_count, td_count) + + if column_count > 0: + # Base width + additional width per column + base_width = 300 + width_per_column = 80 + calculated_width = base_width + (column_count * width_per_column) + + # Cap at 4/5th of image width + max_width = int(image_width * 0.8) + return min(calculated_width, max_width) + + # Default width for non-tables or when calculation fails + return 400 + + def _create_annotated_image(self, page: Dict, elements: List[Dict]) -> str: + """Create annotated image with SCALING to fit within 1024px width.""" + image_uri = page.get('image_uri', '') + page_id = page.get('id', 0) + + if not image_uri: + return "

No image URI found for this page

" + + # Load image + img_data_uri = self._load_image_as_base64(image_uri) + if not img_data_uri: + return f""" +
+ Could not load image: {image_uri}
+ Make sure the file exists and is accessible. +
+ """ + + # Get original image dimensions + original_dimensions = self._get_image_dimensions(image_uri) + if not original_dimensions: + # Fallback: display without explicit scaling + original_width, original_height = 1024, 768 # Default fallback + else: + original_width, original_height = original_dimensions + + # Calculate scaling factor to fit within 1024px width + max_display_width = 1024 + scale_factor = 1.0 + display_width = original_width + display_height = original_height + + if original_width > max_display_width: + scale_factor = max_display_width / original_width + display_width = max_display_width + display_height = int(original_height * scale_factor) + + # Filter elements for this page and collect their bounding boxes + page_elements = [] + + for elem in elements: + elem_bboxes = [] + for bbox in elem.get('bbox', []): + if bbox.get('page_id', 0) == page_id: + coord = bbox.get('coord', []) + if len(coord) >= 4: + elem_bboxes.append(bbox) + + if elem_bboxes: + page_elements.append({ + 'element': elem, + 'bboxes': elem_bboxes + }) + + if not page_elements: + return f"

No elements found for page {page_id}

" + + header_info = f""" +
+ Page {page_id + 1}: {len(page_elements)} elements
+ Original size: {original_width}×{original_height}px | + Display size: {display_width}×{display_height}px | + Scale factor: {scale_factor:.3f}
+
+ """ + + # Generate unique container ID for this page + container_id = f"page_container_{page_id}_{id(self)}" + + # Create bounding box overlays using SCALED coordinates with hover functionality + overlays = [] + + for idx, item in enumerate(page_elements): + element = item['element'] + element_id = element.get('id', 'N/A') + element_type = element.get('type', 'unknown') + color = self._get_element_color(element_type) + + # Use the shared content renderer for tooltip + tooltip_content = self._render_element_content(element, for_tooltip=True) + + # Calculate dynamic tooltip width + tooltip_width = self._calculate_tooltip_width(element, display_width) + + # Tables should render as HTML, other content should be escaped + + for bbox_idx, bbox in enumerate(item['bboxes']): + coord = bbox.get('coord', []) + if len(coord) >= 4: + x1, y1, x2, y2 = coord + + # Apply scaling to coordinates + scaled_x1 = x1 * scale_factor + scaled_y1 = y1 * scale_factor + scaled_x2 = x2 * scale_factor + scaled_y2 = y2 * scale_factor + + width = scaled_x2 - scaled_x1 + height = scaled_y2 - scaled_y1 + + # Skip invalid boxes + if width <= 0 or height <= 0: + continue + + # Position label above box when possible + label_top = -18 if scaled_y1 >= 18 else 2 + + # Unique ID for this bounding box + box_id = f"bbox_{page_id}_{idx}_{bbox_idx}" + + # Calculate tooltip position (prefer right side, but switch to left if needed) + tooltip_left = 10 + + overlay = f""" +
+
+ {element_type.upper()[:6]}#{element_id} +
+ +
+
+ {element_type.upper()} #{element_id} +
+
+ {tooltip_content} +
+
+
+ """ + overlays.append(overlay) + + # Pure CSS hover functionality (works in Databricks) + styles = f""" + + """ + + return f""" + {header_info} + {styles} +
+ Page {page_id + 1} + {''.join(overlays)} +
+ """ + + def _create_page_elements_list(self, page_id: int, elements: List[Dict]) -> str: + """Create a detailed list of elements for a specific page.""" + # Filter elements for this page + page_elements = [] + + for elem in elements: + elem_bboxes = [] + for bbox in elem.get('bbox', []): + if bbox.get('page_id', 0) == page_id: + elem_bboxes.append(bbox) + + if elem_bboxes: + page_elements.append(elem) + + if not page_elements: + return f"

No elements found for page {page_id + 1}

" + + html_parts = [] + + for element in page_elements: + element_id = element.get('id', 'N/A') + element_type = element.get('type', 'unknown') + color = self._get_element_color(element_type) + + # Get bounding box info for this page only + bbox_info = "No bbox" + bbox_list = element.get('bbox', []) + if bbox_list: + bbox_details = [] + for bbox in bbox_list: + if bbox.get('page_id', 0) == page_id: + coord = bbox.get('coord', []) + if len(coord) >= 4: + bbox_details.append(f"[{coord[0]:.0f}, {coord[1]:.0f}, {coord[2]:.0f}, {coord[3]:.0f}]") + bbox_info = "; ".join(bbox_details) if bbox_details else "Invalid bbox" + + # Use the shared content renderer for element list display + display_content = self._render_element_content(element, for_tooltip=False) + + element_html = f""" +
+
+

+ {element_type.upper().replace('_', ' ')} (ID: {element_id}) +

+ + {bbox_info} + +
+
+ {display_content} +
+
+ """ + html_parts.append(element_html) + + return f""" +
+

📋 Page {page_id + 1} Elements ({len(page_elements)} items)

+ {''.join(html_parts)} +
+ """ + + def _create_summary(self, document: Dict, metadata: Dict, selected_pages: Set[int], total_pages: int) -> str: + """Create a summary with page selection info.""" + elements = document.get('elements', []) + + # Count elements only on selected pages + selected_elements = [] + for elem in elements: + for bbox in elem.get('bbox', []): + if bbox.get('page_id', 0) in selected_pages: + selected_elements.append(elem) + break + + # Count by type (for selected pages) + type_counts = {} + for elem in selected_elements: + elem_type = elem.get('type', 'unknown') + type_counts[elem_type] = type_counts.get(elem_type, 0) + 1 + + type_list = ', '.join([f"{t}: {c}" for t, c in type_counts.items()]) + + # Create page selection info + if len(selected_pages) == total_pages: + page_info = f"All {total_pages} pages" + else: + # Convert to 1-indexed for display + page_nums = sorted([p + 1 for p in selected_pages]) + if len(page_nums) <= 10: + page_info = f"Pages {', '.join(map(str, page_nums))} ({len(selected_pages)} of {total_pages})" + else: + page_info = f"{len(selected_pages)} of {total_pages} pages selected" + + return f""" +
+

📄 Document Summary

+

Displaying: {page_info}

+

Elements on selected pages: {len(selected_elements)}

+

Element Types: {type_list if type_list else 'None'}

+

Document ID: {str(metadata.get('id', 'N/A'))[:12]}...

+
+ """ + + def render_document(self, parsed_result: Any, page_selection: Union[str, None] = None) -> None: + """Main render function with page selection support. + + Args: + parsed_result: The parsed document result + page_selection: Page selection string. Supported formats: + - "all" or None: Display all pages + - "3": Display only page 3 (1-indexed) + - "1-5": Display pages 1 through 5 (inclusive) + - "1,3,5": Display specific pages + - "1-3,7,10-12": Mixed format + """ + try: + # Convert to dict + if hasattr(parsed_result, 'toPython'): + parsed_dict = parsed_result.toPython() + elif hasattr(parsed_result, 'toJson'): + parsed_dict = json.loads(parsed_result.toJson()) + elif isinstance(parsed_result, dict): + parsed_dict = parsed_result + else: + display(HTML(f"

❌ Could not convert result. Type: {type(parsed_result)}

")) + return + + # Extract components + document = parsed_dict.get('document', {}) + pages = document.get('pages', []) + elements = document.get('elements', []) + metadata = parsed_dict.get('metadata', {}) + + if not elements: + display(HTML("

❌ No elements found in document

")) + return + + # Parse page selection + selected_pages = self._parse_page_selection(page_selection, len(pages)) + + # Display title + display(HTML("

🔍 AI Parse Document Results

")) + + # Display summary with page selection info + summary_html = self._create_summary(document, metadata, selected_pages, len(pages)) + display(HTML(summary_html)) + + # Display color legend + legend_items = [] + for elem_type, color in self.element_colors.items(): + if elem_type != 'default': + legend_items.append(f""" + + + {elem_type.replace('_', ' ').title()} + + """) + + display(HTML(f""" +
+ 🎨 Element Colors:
+ {''.join(legend_items)} +
+ """)) + + # Display annotated images with their corresponding elements (filtered by selection) + if pages: + display(HTML("

🖼️ Annotated Images & Elements

")) + + # Sort selected pages for display + sorted_selected = sorted(selected_pages) + + for page_idx in sorted_selected: + if page_idx < len(pages): + page = pages[page_idx] + + # Display the annotated image + annotated_html = self._create_annotated_image(page, elements) + display(HTML(f"
{annotated_html}
")) + + # Display elements for this page immediately after the image + page_id = page.get('id', page_idx) + page_elements_html = self._create_page_elements_list(page_id, elements) + display(HTML(page_elements_html)) + + except Exception as e: + display(HTML(f"

❌ Error: {str(e)}

")) + import traceback + display(HTML(f"
{traceback.format_exc()}
")) + + +# Simple usage functions +def render_ai_parse_output(parsed_result, page_selection=None): + """Simple function to render ai_parse_document output with page selection. + + Args: + parsed_result: The parsed document result + page_selection: Optional page selection string. Examples: + - None or "all": Display all pages + - "3": Display only page 3 + - "1-5": Display pages 1 through 5 + - "1,3,5": Display specific pages + - "1-3,7,10-12": Mixed format + """ + renderer = DocumentRenderer() + renderer.render_document(parsed_result, page_selection) + +# COMMAND ---------- + +# DBTITLE 1,Debug Visualization Results +for parsed_result in parsed_results: + render_ai_parse_output(parsed_result, page_selection) \ No newline at end of file From 8578b05bedc4cedadc4e1e6f71b380d18598bd71 Mon Sep 17 00:00:00 2001 From: "jas.bali" Date: Mon, 13 Oct 2025 19:43:38 -0400 Subject: [PATCH 4/6] Update HTML output with latest visualization --- .../src/explorations/ai_parse_document -- debug output.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/knowledge_base/workflow_with_ai_parse_document/src/explorations/ai_parse_document -- debug output.html b/knowledge_base/workflow_with_ai_parse_document/src/explorations/ai_parse_document -- debug output.html index ebe618d8..b8e41dd8 100644 --- a/knowledge_base/workflow_with_ai_parse_document/src/explorations/ai_parse_document -- debug output.html +++ b/knowledge_base/workflow_with_ai_parse_document/src/explorations/ai_parse_document -- debug output.html @@ -13,7 +13,7 @@ - + - - - - - - - - \ No newline at end of file From febd50c4c91ff0737358f2a7e42a0f6d205a72e7 Mon Sep 17 00:00:00 2001 From: Pieter Noordhuis Date: Wed, 15 Oct 2025 13:10:13 +0200 Subject: [PATCH 6/6] Run ruff format --- .../ai_parse_document -- debug output.py | 428 ++++++++++-------- .../src/transformations/01_parse_documents.py | 54 ++- .../src/transformations/02_extract_text.py | 45 +- .../03_extract_structured_data.py | 36 +- 4 files changed, 315 
insertions(+), 248 deletions(-) diff --git a/contrib/workflow_with_ai_parse_document/src/explorations/ai_parse_document -- debug output.py b/contrib/workflow_with_ai_parse_document/src/explorations/ai_parse_document -- debug output.py index 2f39afab..9f3f507d 100644 --- a/contrib/workflow_with_ai_parse_document/src/explorations/ai_parse_document -- debug output.py +++ b/contrib/workflow_with_ai_parse_document/src/explorations/ai_parse_document -- debug output.py @@ -109,7 +109,7 @@ # DBTITLE 1,Run Document Parse Code (may take some time) # SQL statement with ai_parse_document() # Note: input_file can be a single file path or a directory path with wildcard -sql = f''' +sql = f""" with parsed_documents AS ( SELECT path, @@ -125,7 +125,7 @@ read_files('{source_files}', format => 'binaryFile') ) select * from parsed_documents -''' +""" parsed_results = [row.parsed for row in spark.sql(sql).collect()] @@ -139,59 +139,62 @@ from PIL import Image import io + class DocumentRenderer: def __init__(self): # Color mapping for different element types self.element_colors = { - 'section_header': '#FF6B6B', - 'text': '#4ECDC4', - 'figure': '#45B7D1', - 'caption': '#96CEB4', - 'page_footer': '#FFEAA7', - 'page_header': '#DDA0DD', - 'table': '#98D8C8', - 'list': '#F7DC6F', - 'default': '#BDC3C7' + "section_header": "#FF6B6B", + "text": "#4ECDC4", + "figure": "#45B7D1", + "caption": "#96CEB4", + "page_footer": "#FFEAA7", + "page_header": "#DDA0DD", + "table": "#98D8C8", + "list": "#F7DC6F", + "default": "#BDC3C7", } - - def _parse_page_selection(self, page_selection: Union[str, None], total_pages: int) -> Set[int]: + + def _parse_page_selection( + self, page_selection: Union[str, None], total_pages: int + ) -> Set[int]: """Parse page selection string and return set of page indices (0-based). 
- + Args: page_selection: Selection string or None total_pages: Total number of pages available - + Returns: Set of 0-based page indices to display """ # Handle None or "all" - return all pages if page_selection is None or page_selection.lower() == "all": return set(range(total_pages)) - + selected_pages = set() - + # Clean the input page_selection = page_selection.strip() - + # Split by commas for multiple selections - parts = page_selection.split(',') - + parts = page_selection.split(",") + for part in parts: part = part.strip() - + # Check if it's a range (contains hyphen) - if '-' in part: + if "-" in part: try: # Split range and convert to integers - range_parts = part.split('-') + range_parts = part.split("-") if len(range_parts) == 2: start = int(range_parts[0].strip()) end = int(range_parts[1].strip()) - + # Convert from 1-indexed to 0-indexed start_idx = start - 1 end_idx = end - 1 - + # Add all pages in range (inclusive) for i in range(start_idx, end_idx + 1): if 0 <= i < total_pages: @@ -207,21 +210,27 @@ def _parse_page_selection(self, page_selection: Union[str, None], total_pages: i if 0 <= page_idx < total_pages: selected_pages.add(page_idx) else: - print(f"Warning: Page {page_num} is out of range (1-{total_pages})") + print( + f"Warning: Page {page_num} is out of range (1-{total_pages})" + ) except ValueError: print(f"Warning: Invalid page number '{part}' in page selection") - + # If no valid pages were selected, default to all pages if not selected_pages: - print(f"Warning: No valid pages in selection '{page_selection}'. Showing all pages.") + print( + f"Warning: No valid pages in selection '{page_selection}'. Showing all pages." 
+ ) return set(range(total_pages)) - + return selected_pages - + def _get_element_color(self, element_type: str) -> str: """Get color for element type.""" - return self.element_colors.get(element_type.lower(), self.element_colors['default']) - + return self.element_colors.get( + element_type.lower(), self.element_colors["default"] + ) + def _get_image_dimensions(self, image_path: str) -> Optional[Tuple[int, int]]: """Get dimensions of an image file.""" try: @@ -232,18 +241,18 @@ def _get_image_dimensions(self, image_path: str) -> Optional[Tuple[int, int]]: except Exception as e: print(f"Error getting image dimensions for {image_path}: {e}") return None - + def _load_image_as_base64(self, image_path: str) -> Optional[str]: """Load image from file path and convert to base64.""" try: if os.path.exists(image_path): - with open(image_path, 'rb') as img_file: + with open(image_path, "rb") as img_file: img_data = img_file.read() - img_base64 = base64.b64encode(img_data).decode('utf-8') + img_base64 = base64.b64encode(img_data).decode("utf-8") ext = os.path.splitext(image_path)[1].lower() - if ext in ['.jpg', '.jpeg']: + if ext in [".jpg", ".jpeg"]: return f"data:image/jpeg;base64,{img_base64}" - elif ext in ['.png']: + elif ext in [".png"]: return f"data:image/png;base64,{img_base64}" else: return f"data:image/jpeg;base64,{img_base64}" @@ -251,25 +260,25 @@ def _load_image_as_base64(self, image_path: str) -> Optional[str]: except Exception as e: print(f"Error loading image {image_path}: {e}") return None - + def _render_element_content(self, element: Dict, for_tooltip: bool = False) -> str: """Render element content with appropriate formatting for both tooltip and element list display. 
- + Args: element: The element dictionary containing content/description for_tooltip: Whether this is for tooltip display (affects styling and truncation) """ - element_type = element.get('type', 'unknown') - content = element.get('content', '') - description = element.get('description', '') - + element_type = element.get("type", "unknown") + content = element.get("content", "") + description = element.get("description", "") + display_content = "" - + if content: - if element_type == 'table': + if element_type == "table": # Render the HTML table with styling table_html = content - + # Apply different styling based on context if for_tooltip: # Compact styling for tooltips with light theme @@ -284,17 +293,17 @@ def _render_element_content(self, element: Dict, for_tooltip: bool = False) -> s th_style = 'style="border: 1px solid #ddd; padding: 8px; background: #f5f5f5; font-weight: bold; text-align: left;"' td_style = 'style="border: 1px solid #ddd; padding: 8px;"' thead_style = 'style="background: #f0f0f0;"' - + # Apply styling transformations - if '
' in table_html: + table_html = table_html.replace('', f'') + if '' in table_html: + table_html = table_html.replace('', f'') + if '
or tags in first row + import re + + # Find first row (either in thead or tbody) + first_row_match = re.search(r']*>(.*?)
' in table_html: - table_html = table_html.replace('
', f'
') - if '' in table_html: - table_html = table_html.replace('', f'') - + if "
' in table_html: - table_html = table_html.replace('', f'') - if '' in table_html: - table_html = table_html.replace('', f'') - if '
" in table_html: + table_html = table_html.replace("
", f"
") + if "" in table_html: + table_html = table_html.replace("", f"") + if for_tooltip: display_content = table_html else: @@ -303,71 +312,85 @@ def _render_element_content(self, element: Dict, for_tooltip: bool = False) -> s # Regular content handling if for_tooltip and len(content) > 500: # Truncate for tooltip display and escape HTML for safety - display_content = self._escape_for_html_attribute(content[:500] + "...") + display_content = self._escape_for_html_attribute( + content[:500] + "..." + ) else: - display_content = self._escape_for_html_attribute(content) if for_tooltip else content + display_content = ( + self._escape_for_html_attribute(content) + if for_tooltip + else content + ) elif description: desc_content = description if for_tooltip and len(desc_content) > 500: desc_content = desc_content[:500] + "..." - + if for_tooltip: - display_content = self._escape_for_html_attribute(f"Description: {desc_content}") + display_content = self._escape_for_html_attribute( + f"Description: {desc_content}" + ) else: display_content = f"Description: {desc_content}" else: - display_content = "No content available" if for_tooltip else "No content" - + display_content = ( + "No content available" if for_tooltip else "No content" + ) + return display_content - + def _escape_for_html_attribute(self, text: str) -> str: """Escape text for safe use in HTML attributes.""" - return (text.replace('&', '&') - .replace('<', '<') - .replace('>', '>') - .replace('"', '"') - .replace("'", ''') - .replace('\n', '
')) - + return ( + text.replace("&", "&") + .replace("<", "<") + .replace(">", ">") + .replace('"', """) + .replace("'", "'") + .replace("\n", "
") + ) + def _calculate_tooltip_width(self, element: Dict, image_width: int) -> int: """Calculate dynamic tooltip width based on table content.""" - element_type = element.get('type', 'unknown') - content = element.get('content', '') - - if element_type == 'table' and content: + element_type = element.get("type", "unknown") + content = element.get("content", "") + + if element_type == "table" and content: # Count columns by looking for ', content, re.DOTALL | re.IGNORECASE) + first_row_match = re.search( + r"]*>(.*?)", content, re.DOTALL | re.IGNORECASE + ) if first_row_match: first_row = first_row_match.group(1) # Count th or td tags - th_count = len(re.findall(r']*>', first_row, re.IGNORECASE)) - td_count = len(re.findall(r']*>', first_row, re.IGNORECASE)) + th_count = len(re.findall(r"]*>", first_row, re.IGNORECASE)) + td_count = len(re.findall(r"]*>", first_row, re.IGNORECASE)) column_count = max(th_count, td_count) - + if column_count > 0: # Base width + additional width per column base_width = 300 width_per_column = 80 calculated_width = base_width + (column_count * width_per_column) - + # Cap at 4/5th of image width max_width = int(image_width * 0.8) return min(calculated_width, max_width) - + # Default width for non-tables or when calculation fails return 400 - + def _create_annotated_image(self, page: Dict, elements: List[Dict]) -> str: """Create annotated image with SCALING to fit within 1024px width.""" - image_uri = page.get('image_uri', '') - page_id = page.get('id', 0) - + image_uri = page.get("image_uri", "") + page_id = page.get("id", 0) + if not image_uri: return "

No image URI found for this page

" - + # Load image img_data_uri = self._load_image_as_base64(image_uri) if not img_data_uri: @@ -377,7 +400,7 @@ def _create_annotated_image(self, page: Dict, elements: List[Dict]) -> str: Make sure the file exists and is accessible. """ - + # Get original image dimensions original_dimensions = self._get_image_dimensions(image_uri) if not original_dimensions: @@ -385,38 +408,35 @@ def _create_annotated_image(self, page: Dict, elements: List[Dict]) -> str: original_width, original_height = 1024, 768 # Default fallback else: original_width, original_height = original_dimensions - + # Calculate scaling factor to fit within 1024px width max_display_width = 1024 scale_factor = 1.0 display_width = original_width display_height = original_height - + if original_width > max_display_width: scale_factor = max_display_width / original_width display_width = max_display_width display_height = int(original_height * scale_factor) - + # Filter elements for this page and collect their bounding boxes page_elements = [] - + for elem in elements: elem_bboxes = [] - for bbox in elem.get('bbox', []): - if bbox.get('page_id', 0) == page_id: - coord = bbox.get('coord', []) + for bbox in elem.get("bbox", []): + if bbox.get("page_id", 0) == page_id: + coord = bbox.get("coord", []) if len(coord) >= 4: elem_bboxes.append(bbox) - + if elem_bboxes: - page_elements.append({ - 'element': elem, - 'bboxes': elem_bboxes - }) - + page_elements.append({"element": elem, "bboxes": elem_bboxes}) + if not page_elements: return f"

No elements found for page {page_id}

" - + header_info = f"""
Page {page_id + 1}: {len(page_elements)} elements
@@ -425,54 +445,54 @@ def _create_annotated_image(self, page: Dict, elements: List[Dict]) -> str: Scale factor: {scale_factor:.3f}
""" - + # Generate unique container ID for this page container_id = f"page_container_{page_id}_{id(self)}" - + # Create bounding box overlays using SCALED coordinates with hover functionality overlays = [] - + for idx, item in enumerate(page_elements): - element = item['element'] - element_id = element.get('id', 'N/A') - element_type = element.get('type', 'unknown') + element = item["element"] + element_id = element.get("id", "N/A") + element_type = element.get("type", "unknown") color = self._get_element_color(element_type) - + # Use the shared content renderer for tooltip tooltip_content = self._render_element_content(element, for_tooltip=True) - + # Calculate dynamic tooltip width tooltip_width = self._calculate_tooltip_width(element, display_width) - + # Tables should render as HTML, other content should be escaped - - for bbox_idx, bbox in enumerate(item['bboxes']): - coord = bbox.get('coord', []) + + for bbox_idx, bbox in enumerate(item["bboxes"]): + coord = bbox.get("coord", []) if len(coord) >= 4: x1, y1, x2, y2 = coord - + # Apply scaling to coordinates scaled_x1 = x1 * scale_factor scaled_y1 = y1 * scale_factor scaled_x2 = x2 * scale_factor scaled_y2 = y2 * scale_factor - + width = scaled_x2 - scaled_x1 height = scaled_y2 - scaled_y1 - + # Skip invalid boxes if width <= 0 or height <= 0: continue - + # Position label above box when possible label_top = -18 if scaled_y1 >= 18 else 2 - + # Unique ID for this bounding box box_id = f"bbox_{page_id}_{idx}_{bbox_idx}" - + # Calculate tooltip position (prefer right side, but switch to left if needed) tooltip_left = 10 - + overlay = f"""
str: white-space: nowrap; border-radius: 2px; box-shadow: 0 1px 2px rgba(0,0,0,0.3); pointer-events: none; - max-width: {max(50, width-4):.0f}px; + max-width: {max(50, width - 4):.0f}px; overflow: hidden; z-index: 1000;"> {element_type.upper()[:6]}#{element_id} @@ -526,7 +546,7 @@ def _create_annotated_image(self, page: Dict, elements: List[Dict]) -> str:
""" overlays.append(overlay) - + # Pure CSS hover functionality (works in Databricks) styles = f""" """ - + return f""" {header_info} {styles} @@ -560,56 +580,58 @@ def _create_annotated_image(self, page: Dict, elements: List[Dict]) -> str: Page {page_id + 1} - {''.join(overlays)} + {"".join(overlays)} """ - + def _create_page_elements_list(self, page_id: int, elements: List[Dict]) -> str: """Create a detailed list of elements for a specific page.""" # Filter elements for this page page_elements = [] - + for elem in elements: elem_bboxes = [] - for bbox in elem.get('bbox', []): - if bbox.get('page_id', 0) == page_id: + for bbox in elem.get("bbox", []): + if bbox.get("page_id", 0) == page_id: elem_bboxes.append(bbox) - + if elem_bboxes: page_elements.append(elem) - + if not page_elements: return f"

No elements found for page {page_id + 1}

" - + html_parts = [] - + for element in page_elements: - element_id = element.get('id', 'N/A') - element_type = element.get('type', 'unknown') + element_id = element.get("id", "N/A") + element_type = element.get("type", "unknown") color = self._get_element_color(element_type) - + # Get bounding box info for this page only bbox_info = "No bbox" - bbox_list = element.get('bbox', []) + bbox_list = element.get("bbox", []) if bbox_list: bbox_details = [] for bbox in bbox_list: - if bbox.get('page_id', 0) == page_id: - coord = bbox.get('coord', []) + if bbox.get("page_id", 0) == page_id: + coord = bbox.get("coord", []) if len(coord) >= 4: - bbox_details.append(f"[{coord[0]:.0f}, {coord[1]:.0f}, {coord[2]:.0f}, {coord[3]:.0f}]") + bbox_details.append( + f"[{coord[0]:.0f}, {coord[1]:.0f}, {coord[2]:.0f}, {coord[3]:.0f}]" + ) bbox_info = "; ".join(bbox_details) if bbox_details else "Invalid bbox" - + # Use the shared content renderer for element list display display_content = self._render_element_content(element, for_tooltip=False) - + element_html = f"""

- {element_type.upper().replace('_', ' ')} (ID: {element_id}) + {element_type.upper().replace("_", " ")} (ID: {element_id})

{bbox_info} @@ -621,34 +643,36 @@ def _create_page_elements_list(self, page_id: int, elements: List[Dict]) -> str:
""" html_parts.append(element_html) - + return f"""

📋 Page {page_id + 1} Elements ({len(page_elements)} items)

- {''.join(html_parts)} + {"".join(html_parts)}
""" - - def _create_summary(self, document: Dict, metadata: Dict, selected_pages: Set[int], total_pages: int) -> str: + + def _create_summary( + self, document: Dict, metadata: Dict, selected_pages: Set[int], total_pages: int + ) -> str: """Create a summary with page selection info.""" - elements = document.get('elements', []) - + elements = document.get("elements", []) + # Count elements only on selected pages selected_elements = [] for elem in elements: - for bbox in elem.get('bbox', []): - if bbox.get('page_id', 0) in selected_pages: + for bbox in elem.get("bbox", []): + if bbox.get("page_id", 0) in selected_pages: selected_elements.append(elem) break - + # Count by type (for selected pages) type_counts = {} for elem in selected_elements: - elem_type = elem.get('type', 'unknown') + elem_type = elem.get("type", "unknown") type_counts[elem_type] = type_counts.get(elem_type, 0) + 1 - - type_list = ', '.join([f"{t}: {c}" for t, c in type_counts.items()]) - + + type_list = ", ".join([f"{t}: {c}" for t, c in type_counts.items()]) + # Create page selection info if len(selected_pages) == total_pages: page_info = f"All {total_pages} pages" @@ -659,20 +683,22 @@ def _create_summary(self, document: Dict, metadata: Dict, selected_pages: Set[in page_info = f"Pages {', '.join(map(str, page_nums))} ({len(selected_pages)} of {total_pages})" else: page_info = f"{len(selected_pages)} of {total_pages} pages selected" - + return f"""

📄 Document Summary

Displaying: {page_info}

Elements on selected pages: {len(selected_elements)}

-

Element Types: {type_list if type_list else 'None'}

-

Document ID: {str(metadata.get('id', 'N/A'))[:12]}...

+

Element Types: {type_list if type_list else "None"}

+

Document ID: {str(metadata.get("id", "N/A"))[:12]}...

""" - - def render_document(self, parsed_result: Any, page_selection: Union[str, None] = None) -> None: + + def render_document( + self, parsed_result: Any, page_selection: Union[str, None] = None + ) -> None: """Main render function with page selection support. - + Args: parsed_result: The parsed document result page_selection: Page selection string. Supported formats: @@ -684,85 +710,100 @@ def render_document(self, parsed_result: Any, page_selection: Union[str, None] = """ try: # Convert to dict - if hasattr(parsed_result, 'toPython'): + if hasattr(parsed_result, "toPython"): parsed_dict = parsed_result.toPython() - elif hasattr(parsed_result, 'toJson'): + elif hasattr(parsed_result, "toJson"): parsed_dict = json.loads(parsed_result.toJson()) elif isinstance(parsed_result, dict): parsed_dict = parsed_result else: - display(HTML(f"

❌ Could not convert result. Type: {type(parsed_result)}

")) + display( + HTML( + f"

❌ Could not convert result. Type: {type(parsed_result)}

" + ) + ) return - + # Extract components - document = parsed_dict.get('document', {}) - pages = document.get('pages', []) - elements = document.get('elements', []) - metadata = parsed_dict.get('metadata', {}) - + document = parsed_dict.get("document", {}) + pages = document.get("pages", []) + elements = document.get("elements", []) + metadata = parsed_dict.get("metadata", {}) + if not elements: - display(HTML("

❌ No elements found in document

")) + display( + HTML("

❌ No elements found in document

") + ) return - + # Parse page selection selected_pages = self._parse_page_selection(page_selection, len(pages)) - + # Display title display(HTML("

🔍 AI Parse Document Results

")) - + # Display summary with page selection info - summary_html = self._create_summary(document, metadata, selected_pages, len(pages)) + summary_html = self._create_summary( + document, metadata, selected_pages, len(pages) + ) display(HTML(summary_html)) - + # Display color legend legend_items = [] for elem_type, color in self.element_colors.items(): - if elem_type != 'default': + if elem_type != "default": legend_items.append(f""" - {elem_type.replace('_', ' ').title()} + {elem_type.replace("_", " ").title()} """) - - display(HTML(f""" + + display( + HTML(f"""
🎨 Element Colors:
- {''.join(legend_items)} + {"".join(legend_items)}
- """)) - + """) + ) + # Display annotated images with their corresponding elements (filtered by selection) if pages: display(HTML("

🖼️ Annotated Images & Elements

")) - + # Sort selected pages for display sorted_selected = sorted(selected_pages) - + for page_idx in sorted_selected: if page_idx < len(pages): page = pages[page_idx] - + # Display the annotated image annotated_html = self._create_annotated_image(page, elements) - display(HTML(f"
{annotated_html}
")) - + display( + HTML(f"
{annotated_html}
") + ) + # Display elements for this page immediately after the image - page_id = page.get('id', page_idx) - page_elements_html = self._create_page_elements_list(page_id, elements) + page_id = page.get("id", page_idx) + page_elements_html = self._create_page_elements_list( + page_id, elements + ) display(HTML(page_elements_html)) - + except Exception as e: display(HTML(f"

❌ Error: {str(e)}

")) import traceback + display(HTML(f"
{traceback.format_exc()}
")) # Simple usage functions def render_ai_parse_output(parsed_result, page_selection=None): """Simple function to render ai_parse_document output with page selection. - + Args: parsed_result: The parsed document result page_selection: Optional page selection string. Examples: @@ -775,8 +816,9 @@ def render_ai_parse_output(parsed_result, page_selection=None): renderer = DocumentRenderer() renderer.render_document(parsed_result, page_selection) + # COMMAND ---------- # DBTITLE 1,Debug Visualization Results for parsed_result in parsed_results: - render_ai_parse_output(parsed_result, page_selection) \ No newline at end of file + render_ai_parse_output(parsed_result, page_selection) diff --git a/contrib/workflow_with_ai_parse_document/src/transformations/01_parse_documents.py b/contrib/workflow_with_ai_parse_document/src/transformations/01_parse_documents.py index 4c3e2bb6..297aad29 100644 --- a/contrib/workflow_with_ai_parse_document/src/transformations/01_parse_documents.py +++ b/contrib/workflow_with_ai_parse_document/src/transformations/01_parse_documents.py @@ -9,9 +9,17 @@ # Get parameters dbutils.widgets.text("catalog", "main", "Catalog name") dbutils.widgets.text("schema", "default", "Schema name") -dbutils.widgets.text("source_volume_path", "/Volumes/main/default/source_documents", "Source volume path") -dbutils.widgets.text("output_volume_path", "/Volumes/main/default/parsed_output", "Output volume path") -dbutils.widgets.text("checkpoint_location", "/Volumes/main/default/checkpoints/parse_documents", "Checkpoint location") +dbutils.widgets.text( + "source_volume_path", "/Volumes/main/default/source_documents", "Source volume path" +) +dbutils.widgets.text( + "output_volume_path", "/Volumes/main/default/parsed_output", "Output volume path" +) +dbutils.widgets.text( + "checkpoint_location", + "/Volumes/main/default/checkpoints/parse_documents", + "Checkpoint location", +) dbutils.widgets.text("table_name", "parsed_documents_raw", "Output table name") catalog = 
dbutils.widgets.get("catalog") @@ -30,19 +38,28 @@ # COMMAND ---------- from pyspark.sql.functions import col, current_timestamp, expr -from pyspark.sql.types import StructType, StructField, StringType, BinaryType, TimestampType, LongType +from pyspark.sql.types import ( + StructType, + StructField, + StringType, + BinaryType, + TimestampType, + LongType, +) # Define schema for binary files (must match exact schema expected by binaryFile format) -binary_file_schema = StructType([ - StructField("path", StringType(), False), - StructField("modificationTime", TimestampType(), False), - StructField("length", LongType(), False), - StructField("content", BinaryType(), True) -]) +binary_file_schema = StructType( + [ + StructField("path", StringType(), False), + StructField("modificationTime", TimestampType(), False), + StructField("length", LongType(), False), + StructField("content", BinaryType(), True), + ] +) # Read files using Structured Streaming -files_df = (spark.readStream - .format("binaryFile") +files_df = ( + spark.readStream.format("binaryFile") .schema(binary_file_schema) .option("pathGlobFilter", "*.{pdf,jpg,jpeg,png}") .option("recursiveFileLookup", "true") @@ -50,9 +67,10 @@ ) # Parse documents with ai_parse_document -parsed_df = (files_df - .repartition(8, expr("crc32(path) % 8")) - .withColumn("parsed", +parsed_df = ( + files_df.repartition(8, expr("crc32(path) % 8")) + .withColumn( + "parsed", expr(f""" ai_parse_document( content, @@ -62,15 +80,15 @@ 'descriptionElementTypes', '*' ) ) - """) + """), ) .withColumn("parsed_at", current_timestamp()) .select("path", "parsed", "parsed_at") ) # Write to Delta table with streaming -(parsed_df.writeStream - .format("delta") +( + parsed_df.writeStream.format("delta") .outputMode("append") .option("checkpointLocation", checkpoint_location) .option("delta.feature.variantType-preview", "supported") diff --git a/contrib/workflow_with_ai_parse_document/src/transformations/02_extract_text.py 
b/contrib/workflow_with_ai_parse_document/src/transformations/02_extract_text.py index d46b8d23..29fa9097 100644 --- a/contrib/workflow_with_ai_parse_document/src/transformations/02_extract_text.py +++ b/contrib/workflow_with_ai_parse_document/src/transformations/02_extract_text.py @@ -9,7 +9,11 @@ # Get parameters dbutils.widgets.text("catalog", "main", "Catalog name") dbutils.widgets.text("schema", "default", "Schema name") -dbutils.widgets.text("checkpoint_location", "/Volumes/main/default/checkpoints/extract_text", "Checkpoint location") +dbutils.widgets.text( + "checkpoint_location", + "/Volumes/main/default/checkpoints/extract_text", + "Checkpoint location", +) dbutils.widgets.text("source_table_name", "parsed_documents_raw", "Source table name") dbutils.widgets.text("table_name", "parsed_documents_text", "Output table name") @@ -30,21 +34,18 @@ from pyspark.sql.functions import col, concat_ws, expr, lit, when # Read from source table using Structured Streaming -parsed_stream = (spark.readStream - .format("delta") - .table(source_table_name) -) +parsed_stream = spark.readStream.format("delta").table(source_table_name) # Extract text from parsed documents -text_df = parsed_stream.withColumn( - "text", - when( - expr("try_cast(parsed:error_status AS STRING)").isNotNull(), - lit(None) - ).otherwise( - concat_ws( - "\n\n", - expr(""" +text_df = ( + parsed_stream.withColumn( + "text", + when( + expr("try_cast(parsed:error_status AS STRING)").isNotNull(), lit(None) + ).otherwise( + concat_ws( + "\n\n", + expr(""" transform( CASE WHEN try_cast(parsed:metadata:version AS STRING) = '1.0' @@ -53,17 +54,17 @@ END, element -> try_cast(element:content AS STRING) ) - """) - ) + """), + ) + ), ) -).withColumn( - "error_status", - expr("try_cast(parsed:error_status AS STRING)") -).select("path", "text", "error_status", "parsed_at") + .withColumn("error_status", expr("try_cast(parsed:error_status AS STRING)")) + .select("path", "text", "error_status", "parsed_at") +) # Write 
to Delta table with streaming -(text_df.writeStream - .format("delta") +( + text_df.writeStream.format("delta") .outputMode("append") .option("checkpointLocation", checkpoint_location) .option("mergeSchema", "true") diff --git a/contrib/workflow_with_ai_parse_document/src/transformations/03_extract_structured_data.py b/contrib/workflow_with_ai_parse_document/src/transformations/03_extract_structured_data.py index a78a2ad0..7320711c 100644 --- a/contrib/workflow_with_ai_parse_document/src/transformations/03_extract_structured_data.py +++ b/contrib/workflow_with_ai_parse_document/src/transformations/03_extract_structured_data.py @@ -9,7 +9,11 @@ # Get parameters dbutils.widgets.text("catalog", "main", "Catalog name") dbutils.widgets.text("schema", "default", "Schema name") -dbutils.widgets.text("checkpoint_location", "/Volumes/main/default/checkpoints/extract_structured", "Checkpoint location") +dbutils.widgets.text( + "checkpoint_location", + "/Volumes/main/default/checkpoints/extract_structured", + "Checkpoint location", +) dbutils.widgets.text("source_table_name", "parsed_documents_text", "Source table name") dbutils.widgets.text("table_name", "parsed_documents_structured", "Output table name") @@ -30,20 +34,21 @@ from pyspark.sql.functions import col, concat, current_timestamp, expr, length, lit # Read from source table using Structured Streaming -text_stream = (spark.readStream - .format("delta") +text_stream = ( + spark.readStream.format("delta") .table(source_table_name) .filter( - (col("text").isNotNull()) & - (col("error_status").isNull()) & - (length(col("text")) > 100) + (col("text").isNotNull()) + & (col("error_status").isNull()) + & (length(col("text")) > 100) ) ) # Extract structured data using ai_query -structured_df = text_stream.withColumn( - "extracted_json", - expr(""" +structured_df = ( + text_stream.withColumn( + "extracted_json", + expr(""" ai_query( 'databricks-claude-sonnet-4', concat( @@ -59,14 +64,15 @@ 'temperature', 0.1 ) ) - """) 
-).withColumn( - "extraction_timestamp", current_timestamp() -).select("path", "extracted_json", "parsed_at", "extraction_timestamp") + """), + ) + .withColumn("extraction_timestamp", current_timestamp()) + .select("path", "extracted_json", "parsed_at", "extraction_timestamp") +) # Write to Delta table with streaming -(structured_df.writeStream - .format("delta") +( + structured_df.writeStream.format("delta") .outputMode("append") .option("checkpointLocation", checkpoint_location) .option("mergeSchema", "true")
" in table_html: + table_html = table_html.replace("", f"") + if "" in table_html: + table_html = table_html.replace("", f"") + if "
or tags in first row import re - + # Find first row (either in thead or tbody) - first_row_match = re.search(r']*>(.*?)