From 0f2a75b719fe7a75c1123a7fc8d27e7a65eaf69e Mon Sep 17 00:00:00 2001 From: aliamerj Date: Sun, 27 Apr 2025 11:23:23 +0300 Subject: [PATCH 1/2] =?UTF-8?q?Switch=20PDF=E2=80=90processing=20to=20an?= =?UTF-8?q?=20internal=20FastAPI=20microservice?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docker-compose.dev.yml | 7 +- docker-compose.prod.yml | 8 +++ fileProcessors/Files/pdf.ts | 92 ++++++------------------ fileProcessors/connectors/aws.ts | 7 +- fileProcessors/connectors/dropbox.ts | 5 +- fileProcessors/connectors/googleDrive.ts | 9 ++- fileProcessors/index.ts | 5 +- scripts/process_pdf.py | 64 ----------------- scripts/pyrightconfig.json | 5 -- scripts/requirements.txt | 2 - svc_parser/.dockerignore | 4 ++ svc_parser/.gitignore | 2 + svc_parser/Dockerfile | 13 ++++ svc_parser/app/__init__.py | 0 svc_parser/app/main.py | 75 +++++++++++++++++++ svc_parser/pyrightconfig.json | 5 ++ svc_parser/requirements.txt | 4 ++ 17 files changed, 156 insertions(+), 151 deletions(-) delete mode 100644 scripts/process_pdf.py delete mode 100644 scripts/pyrightconfig.json delete mode 100644 scripts/requirements.txt create mode 100644 svc_parser/.dockerignore create mode 100644 svc_parser/.gitignore create mode 100644 svc_parser/Dockerfile create mode 100644 svc_parser/app/__init__.py create mode 100644 svc_parser/app/main.py create mode 100644 svc_parser/pyrightconfig.json create mode 100644 svc_parser/requirements.txt diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml index e3d0252..6bf9749 100644 --- a/docker-compose.dev.yml +++ b/docker-compose.dev.yml @@ -28,7 +28,12 @@ services: - "6333:6333" volumes: - qdrant_data:/data - + parser: + build: + context: ./svc_parser/ + dockerfile: Dockerfile + ports: + - "9000:9000" volumes: postgres_data: redis_data: diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index 61a60a3..15b8661 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -28,6 +28,12 @@ services: - "6333:6333" volumes: - qdrant_data:/data + parser: + build: + context: ./svc_parser/ + dockerfile: Dockerfile + ports: + - "9000:9000" app: build: context: . @@ -39,6 +45,7 @@ services: HOSTNAME: "0.0.0.0" DCUP_DOC_URL: https://dcup.dev/docs NEXTAUTH_URL: http://localhost:8080 + DCUP_PARSER: http://parser:9000 NEXTAUTH_SECRET: ${NEXTAUTH_SECRET} API_SECRET: ${API_SECRET} @@ -74,6 +81,7 @@ services: - postgres - redis - qdrant + - parser volumes: postgres_data: diff --git a/fileProcessors/Files/pdf.ts b/fileProcessors/Files/pdf.ts index b4b0281..a1a3e19 100644 --- a/fileProcessors/Files/pdf.ts +++ b/fileProcessors/Files/pdf.ts @@ -1,78 +1,26 @@ import { PageContent } from "@/fileProcessors"; -import { spawn } from "child_process" -import path from "path" -export const processPdfBuffer = async (fileContent: Buffer): Promise => { - return new Promise((resolve, reject) => { - const python = path.join(process.cwd(), "scripts", "venv", "bin", "python3"); - const script = path.join(process.cwd(), "scripts", "process_pdf.py"); - - const processChild = spawn(python, [script]); - - let output = ''; - let errorOutput = ''; - - processChild.stdout.on('data', (data) => { - output += data.toString(); - }); - - processChild.stderr.on('data', (data) => { - errorOutput += data.toString(); - }); - - processChild.on('close', (code) => { - if (code !== 0) { - return reject(new Error(`Python process exited with code ${code}: ${errorOutput}`)); - } - try { - const result = JSON.parse(output); - if (result.error) { - return reject(new Error(result.error)); - } - resolve(result); - } catch (err) { - reject(err); - } - }); - - // Send the Buffer data to Python process - processChild.stdin.write(fileContent); - processChild.stdin.end(); // Close the stdin stream - }); +export const processPdfBuffer = async (fileContent: Blob): Promise => { + const form = new FormData(); + form.set("upload", fileContent); + const res = await fetch(process.env.DCUP_PARSER +"/process/pdf/file", { + method: 'POST', + body: form + }) + + const body = await res.json() + if (!res.ok) throw new Error(body.detail) + return body }; export const processPdfLink = async (pdfUrl: string): Promise => { - return new Promise((resolve, reject) => { - const python = path.join(process.cwd(), "scripts", "venv", "bin", "python3"); - const script = path.join(process.cwd(), "scripts", "process_pdf.py"); - - // Pass the pdfUrl as a command-line argument - const processChild = spawn(python, [script, pdfUrl]); - - let output = ''; - let errorOutput = ''; - - processChild.stdout.on('data', (data) => { - output += data.toString(); - }); - - processChild.stderr.on('data', (data) => { - errorOutput += data.toString(); - }); - - processChild.on('close', (code) => { - if (code !== 0) { - return reject(new Error(`Python process exited with code ${code}: ${errorOutput}`)); - } - try { - const result = JSON.parse(output); - if (result.error) { - return reject(new Error(result.error)); - } - resolve(result); - } catch (err) { - reject(err); - } - }); - }); + const form = new FormData(); + form.set("url", pdfUrl); + const res = await fetch(process.env.DCUP_PARSER +"/process/pdf/url", { + method: 'POST', + body: form + }) + const body = await res.json() + if (!res.ok) throw new Error(body.detail) + return body }; diff --git a/fileProcessors/connectors/aws.ts b/fileProcessors/connectors/aws.ts index 9d59f19..ec9e737 100644 --- a/fileProcessors/connectors/aws.ts +++ b/fileProcessors/connectors/aws.ts @@ -60,7 +60,7 @@ export const readAWSFiles = async ( const response = await s3Client.send(getCommand); if (!response.Body) continue; - const buffer = await streamToBuffer(response.Body) + const buffer = await streamToBlob(response.Body) const content = await processPdfBuffer(buffer); const fileName = fileKey.split('/').pop() || fileKey; @@ -105,3 +105,8 @@ async function streamToBuffer(stream: Readable): Promise { }); }); } + +async function streamToBlob(stream: Readable): Promise { + const buffer = await streamToBuffer(stream); // Convert stream to buffer + return new Blob([buffer]); // Convert buffer to Blob +} diff --git a/fileProcessors/connectors/dropbox.ts b/fileProcessors/connectors/dropbox.ts index 7c54672..8a6b954 100644 --- a/fileProcessors/connectors/dropbox.ts +++ b/fileProcessors/connectors/dropbox.ts @@ -138,9 +138,8 @@ export const readDropboxFiles = async ( }, }); - const fileStream = await response.arrayBuffer(); - const buf = Buffer.from(fileStream); - const content = await processPdfBuffer(buf); + const blob = await response.blob(); + const content = await processPdfBuffer(blob); const fileContent: FileContent = { name: file.name || "", pages: content, diff --git a/fileProcessors/connectors/googleDrive.ts b/fileProcessors/connectors/googleDrive.ts index 132ca87..be65065 100644 --- a/fileProcessors/connectors/googleDrive.ts +++ b/fileProcessors/connectors/googleDrive.ts @@ -101,8 +101,8 @@ export const readGoogleDriveFiles = async ( alt: "media", }, { responseType: 'stream' }); - const buf = await streamToBuffer(res.data); - const content = await processPdfBuffer(buf); + const blob = await streamToBlob(res.data); + const content = await processPdfBuffer(blob); const fileContent: FileContent = { name: file.name || "", @@ -155,3 +155,8 @@ async function streamToBuffer(stream: Readable): Promise { }); }); } + +async function streamToBlob(stream: Readable): Promise { + const buffer = await streamToBuffer(stream); // Convert stream to buffer + return new Blob([buffer]); // Convert buffer to Blob +} diff --git a/fileProcessors/index.ts b/fileProcessors/index.ts index 038ebee..371d9dc 100644 --- a/fileProcessors/index.ts +++ b/fileProcessors/index.ts @@ -27,7 +27,10 @@ export type PageContent = { export const directProcessFiles = async ({ files, metadata, service, connectionId, links, pageLimit, fileLimit }: TQueue) => { // Create promises for processing file URLs const filePromises = files.map(async (file) => { - const content = await processPdfBuffer(Buffer.from(file.content, 'base64')); + const arrayBuffer = Buffer.from(file.content, 'base64').buffer; + + const content = await processPdfBuffer(new Blob([arrayBuffer])); + return { name: file.name || "", pages: content, diff --git a/scripts/process_pdf.py b/scripts/process_pdf.py deleted file mode 100644 index 1331500..0000000 --- a/scripts/process_pdf.py +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env python -import io -import pdfplumber -import requests -import argparse -import json -import sys - - -def download_pdf(url: str) -> bytes: - try: - response = requests.get(url, timeout=10) - response.raise_for_status() - - if "application/pdf" not in response.headers.get("Content-Type", ""): - raise ValueError("URL does not point to a PDF file") - - return response.content - except requests.exceptions.RequestException as e: - raise ValueError(f"Failed to download PDF: {e}") - - -def extract_text_from_pdf(pdf_bytes: bytes): - pages_data = [] - try: - with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf: - for page in pdf.pages: - page_text = page.extract_text() or "" - tables = page.extract_tables() or [] - pages_data.append({ - "text": page_text.strip(), - "tables": tables, - }) - return pages_data - except Exception as e: - raise ValueError(f"Error processing PDF: {e}") - - -def main(): - parser = argparse.ArgumentParser(description="Extract text from PDF URL") - parser.add_argument("input", nargs="?", type=str, help="URL of the PDF file") - args = parser.parse_args() - try: - if args.input and ( - args.input.startswith("http://") or args.input.startswith("https://") - ): - pdf_content = download_pdf(args.input) - else: - if not sys.stdin.isatty(): - pdf_content = sys.stdin.buffer.read() - if not pdf_content: - raise ValueError("No PDF data provided via STDIN.") - else: - raise ValueError("No valid URL provided and no PDF data piped.") - - result = extract_text_from_pdf(pdf_content) - except Exception as e: - result = {"error": str(e)} - - print(json.dumps(result)) - - -if __name__ == "__main__": - main() diff --git a/scripts/pyrightconfig.json b/scripts/pyrightconfig.json deleted file mode 100644 index 3b49290..0000000 --- a/scripts/pyrightconfig.json +++ /dev/null @@ -1,5 +0,0 @@ -{ - "exclude": [ "venv" ], - "venvPath": ".", - "venv": "venv", -} diff --git a/scripts/requirements.txt b/scripts/requirements.txt deleted file mode 100644 index b1e7659..0000000 --- a/scripts/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -pdfplumber -requests diff --git a/svc_parser/.dockerignore b/svc_parser/.dockerignore new file mode 100644 index 0000000..2ccad6d --- /dev/null +++ b/svc_parser/.dockerignore @@ -0,0 +1,4 @@ +__pycache__ +*.pyc +*.git +node_modules diff --git a/svc_parser/.gitignore b/svc_parser/.gitignore new file mode 100644 index 0000000..033df5f --- /dev/null +++ b/svc_parser/.gitignore @@ -0,0 +1,2 @@ +.venv +__pycache__ diff --git a/svc_parser/Dockerfile b/svc_parser/Dockerfile new file mode 100644 index 0000000..15f0e25 --- /dev/null +++ b/svc_parser/Dockerfile @@ -0,0 +1,13 @@ +FROM python:3.12.3 + +WORKDIR /code + +COPY ./requirements.txt /code/requirements.txt + +RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt + +COPY ./app /code/app + +EXPOSE 9000 + +CMD ["fastapi", "run", "app/main.py", "--port", "9000"] diff --git a/svc_parser/app/__init__.py b/svc_parser/app/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/svc_parser/app/main.py b/svc_parser/app/main.py new file mode 100644 index 0000000..976145e --- /dev/null +++ b/svc_parser/app/main.py @@ -0,0 +1,75 @@ +from fastapi import FastAPI, File, Form, Path, UploadFile, HTTPException +from fastapi.responses import JSONResponse +from pydantic import BaseModel +from enum import Enum +import requests +import io +import pdfplumber + +app = FastAPI(title="File Processing Microservice") + + +class FileType(str, Enum): + pdf = "pdf" + + +class InputMode(str, Enum): + file = "file" + url = "url" + + +class PageContent(BaseModel): + text: str + tables: list[list[list[str]]] + + +@app.post( + "/process/{file_type}/{input_mode}", + response_model=list[PageContent], + summary="Process an uploaded file or URL", +) +async def process_pdf( + file_type: FileType = Path(..., description="Type of file to process"), + input_mode: InputMode = Path(..., description="How content is passed"), + upload: UploadFile | None = File(None, description="The file to upload"), + url: str | None = Form(None, description="Link to fetch"), +): + try: + if input_mode == InputMode.url: + if not url: + raise HTTPException(422, "Must provide a URL when input_mode is 'url'") + resp = requests.get(url, timeout=10) + resp.raise_for_status() + if file_type == FileType.pdf: + if "application/pdf" not in resp.headers.get("Content-Type", ""): + raise HTTPException(400,"URL does not point to a PDF file") + data = extract_text_from_pdf(resp.content) + return JSONResponse(content=data, status_code=200) + if input_mode == InputMode.file: + if not upload: + raise HTTPException(422, "Must upload a file when input_mode is 'file'") + data_bytes = await upload.read() + data = extract_text_from_pdf(data_bytes) + return JSONResponse(content=data, status_code=200) + + except ValueError as e: + raise HTTPException(status_code=400, detail=str(e)) + except Exception as e: + raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}") + + + +def extract_text_from_pdf(pdf_bytes: bytes): + pages_data = [] + try: + with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf: + for page in pdf.pages: + page_text = page.extract_text() or "" + tables = page.extract_tables() or [] + pages_data.append({ + "text": page_text.strip(), + "tables": tables, + }) + return pages_data + except Exception as e: + raise ValueError(f"Error processing PDF: {e}") diff --git a/svc_parser/pyrightconfig.json b/svc_parser/pyrightconfig.json new file mode 100644 index 0000000..dfaba36 --- /dev/null +++ b/svc_parser/pyrightconfig.json @@ -0,0 +1,5 @@ +{ + "exclude": [ ".venv" ], + "venvPath": ".", + "venv": ".venv", +} diff --git a/svc_parser/requirements.txt b/svc_parser/requirements.txt new file mode 100644 index 0000000..3bf115c --- /dev/null +++ b/svc_parser/requirements.txt @@ -0,0 +1,4 @@ +fastapi[standard]==0.113.0 +pydantic==2.8.0 +pdfplumber +requests From ccde0289e79dd903f08f50882fa3b9aeffb5f7eb Mon Sep 17 00:00:00 2001 From: aliamerj Date: Sun, 27 Apr 2025 12:33:30 +0300 Subject: [PATCH 2/2] set up render config --- package.json | 1 - render.yaml | 16 ++++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) create mode 100644 render.yaml diff --git a/package.json b/package.json index 35b035d..2c1144e 100644 --- a/package.json +++ b/package.json @@ -3,7 +3,6 @@ "version": "0.0.0", "private": true, "scripts": { - "postinstall": "python3 -m venv scripts/venv && scripts/venv/bin/pip install -r scripts/requirements.txt", "build": "next build", "dev": "next dev", "start": "next start", diff --git a/render.yaml b/render.yaml new file mode 100644 index 0000000..4dfce5c --- /dev/null +++ b/render.yaml @@ -0,0 +1,16 @@ +services: + - type: web + name: dcup-web + runtime: docker + dockerContext: . + dockerfilePath: ./Dockerfile + httpPort: 8080 + + - type: pserv + name: parser + runtime: docker + dockerContext: ./svc_parser + dockerfilePath: ./svc_parser/Dockerfile + httpPort: 9000 + +