Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion docker-compose.dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,12 @@ services:
- "6333:6333"
volumes:
- qdrant_data:/data

parser:
build:
context: ./svc_parser/
dockerfile: Dockerfile
ports:
- "9000:9000"
volumes:
postgres_data:
redis_data:
Expand Down
8 changes: 8 additions & 0 deletions docker-compose.prod.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,12 @@ services:
- "6333:6333"
volumes:
- qdrant_data:/data
parser:
build:
context: ./svc_parser/
dockerfile: Dockerfile
ports:
- "9000:9000"
app:
build:
context: .
Expand All @@ -39,6 +45,7 @@ services:
HOSTNAME: "0.0.0.0"
DCUP_DOC_URL: https://dcup.dev/docs
NEXTAUTH_URL: http://localhost:8080
DCUP_PARSER: http://parser:9000
NEXTAUTH_SECRET: ${NEXTAUTH_SECRET}
API_SECRET: ${API_SECRET}

Expand Down Expand Up @@ -74,6 +81,7 @@ services:
- postgres
- redis
- qdrant
- parser

volumes:
postgres_data:
Expand Down
92 changes: 20 additions & 72 deletions fileProcessors/Files/pdf.ts
Original file line number Diff line number Diff line change
@@ -1,78 +1,26 @@
import { PageContent } from "@/fileProcessors";
import { spawn } from "child_process"
import path from "path"

export const processPdfBuffer = async (fileContent: Buffer): Promise<PageContent[]> => {
return new Promise((resolve, reject) => {
const python = path.join(process.cwd(), "scripts", "venv", "bin", "python3");
const script = path.join(process.cwd(), "scripts", "process_pdf.py");

const processChild = spawn(python, [script]);

let output = '';
let errorOutput = '';

processChild.stdout.on('data', (data) => {
output += data.toString();
});

processChild.stderr.on('data', (data) => {
errorOutput += data.toString();
});

processChild.on('close', (code) => {
if (code !== 0) {
return reject(new Error(`Python process exited with code ${code}: ${errorOutput}`));
}
try {
const result = JSON.parse(output);
if (result.error) {
return reject(new Error(result.error));
}
resolve(result);
} catch (err) {
reject(err);
}
});

// Send the Buffer data to Python process
processChild.stdin.write(fileContent);
processChild.stdin.end(); // Close the stdin stream
});
/**
 * Uploads a PDF to the parser service and returns the parsed response body.
 *
 * The blob is sent as the multipart form field `upload` in a POST to
 * `${DCUP_PARSER}/process/pdf/file`.
 *
 * @param fileContent - The PDF bytes to upload.
 * @returns The JSON body of a successful response, passed through unvalidated.
 * @throws Error when `DCUP_PARSER` is not set, or when the service answers
 *   with a non-2xx status. The message is the error body's `detail` field when
 *   that body is JSON with a string `detail`; otherwise it is the HTTP status
 *   line (with any non-string `detail` appended as JSON).
 */
export const processPdfBuffer = async (fileContent: Blob): Promise<PageContent[]> => {
  const baseUrl = process.env.DCUP_PARSER;
  if (!baseUrl) {
    // Without this guard the request URL would become "undefined/process/pdf/file".
    throw new Error("DCUP_PARSER is not set; cannot reach the PDF parser service");
  }

  const form = new FormData();
  form.set("upload", fileContent);
  const res = await fetch(baseUrl + "/process/pdf/file", {
    method: 'POST',
    body: form
  });

  if (!res.ok) {
    const statusLine = `PDF parser responded with ${res.status} ${res.statusText}`.trim();

    // Read the error body as text first: it is not guaranteed to be JSON
    // (e.g. an HTML error page from a proxy), and a parse failure here must
    // not hide the HTTP failure itself.
    const raw = await res.text().catch(() => "");
    let detail: unknown;
    try {
      detail = JSON.parse(raw)?.detail;
    } catch {
      detail = undefined;
    }

    if (typeof detail === "string" && detail.length > 0) {
      throw new Error(detail);
    }
    if (detail !== undefined && detail !== null) {
      // A structured detail would otherwise stringify to "[object Object]".
      throw new Error(`${statusLine}: ${JSON.stringify(detail)}`);
    }
    throw new Error(statusLine);
  }

  return await res.json();
};

// processPdfLink takes a PDF URL and resolves to the parsed result.
//
// As rendered here, the body contains two implementations back to back:
//   1. a child-process version that spawns scripts/process_pdf.py with the URL
//      as an argument and parses the script's stdout as JSON;
//   2. an HTTP version that POSTs the URL as the form field "url" to
//      `${DCUP_PARSER}/process/pdf/url`.
// NOTE(review): presumably (1) is the deleted side of this diff and (2) is the
// replacement — confirm against the merged file.
export const processPdfLink = async (pdfUrl: string): Promise<PageContent[]> => {
return new Promise((resolve, reject) => {
// Interpreter and script are resolved relative to the process working directory.
const python = path.join(process.cwd(), "scripts", "venv", "bin", "python3");
const script = path.join(process.cwd(), "scripts", "process_pdf.py");

// Pass the pdfUrl as a command-line argument
const processChild = spawn(python, [script, pdfUrl]);

// stdout and stderr are accumulated as strings until the child closes.
let output = '';
let errorOutput = '';

processChild.stdout.on('data', (data) => {
output += data.toString();
});

processChild.stderr.on('data', (data) => {
errorOutput += data.toString();
});

processChild.on('close', (code) => {
// A non-zero exit code rejects with the collected stderr text.
if (code !== 0) {
return reject(new Error(`Python process exited with code ${code}: ${errorOutput}`));
}
try {
const result = JSON.parse(output);
// The script can also report failure in-band through an `error` field.
if (result.error) {
return reject(new Error(result.error));
}
resolve(result);
} catch (err) {
// stdout was not valid JSON.
reject(err);
}
});
});
const form = new FormData();
form.set("url", pdfUrl);
// NOTE(review): if DCUP_PARSER is unset, this concatenation produces the
// string "undefined/process/pdf/url".
const res = await fetch(process.env.DCUP_PARSER +"/process/pdf/url", {
method: 'POST',
body: form
})
// NOTE(review): the body is parsed as JSON before res.ok is checked, so a
// non-JSON error response would reject with a JSON parse error rather than
// with body.detail.
const body = await res.json()
if (!res.ok) throw new Error(body.detail)
return body
};
7 changes: 6 additions & 1 deletion fileProcessors/connectors/aws.ts
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ export const readAWSFiles = async (

const response = await s3Client.send(getCommand);
if (!response.Body) continue;
const buffer = await streamToBuffer(response.Body)
const buffer = await streamToBlob(response.Body)
const content = await processPdfBuffer(buffer);
const fileName = fileKey.split('/').pop() || fileKey;

Expand Down Expand Up @@ -105,3 +105,8 @@ async function streamToBuffer(stream: Readable): Promise<Buffer> {
});
});
}

/**
 * Collects a readable stream into a single Blob.
 *
 * Awaits `streamToBuffer` for the stream and wraps the resulting Buffer as the
 * only part of a new Blob. No MIME type is set on the Blob.
 *
 * @param stream - The stream to read.
 * @returns A Blob built from the Buffer that `streamToBuffer` resolves to.
 */
async function streamToBlob(stream: Readable): Promise<Blob> {
  return new Blob([await streamToBuffer(stream)]);
}
5 changes: 2 additions & 3 deletions fileProcessors/connectors/dropbox.ts
Original file line number Diff line number Diff line change
Expand Up @@ -138,9 +138,8 @@ export const readDropboxFiles = async (
},
});

const fileStream = await response.arrayBuffer();
const buf = Buffer.from(fileStream);
const content = await processPdfBuffer(buf);
const blob = await response.blob();
const content = await processPdfBuffer(blob);
const fileContent: FileContent = {
name: file.name || "",
pages: content,
Expand Down
9 changes: 7 additions & 2 deletions fileProcessors/connectors/googleDrive.ts
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,8 @@ export const readGoogleDriveFiles = async (
alt: "media",
}, { responseType: 'stream' });

const buf = await streamToBuffer(res.data);
const content = await processPdfBuffer(buf);
const blob = await streamToBlob(res.data);
const content = await processPdfBuffer(blob);

const fileContent: FileContent = {
name: file.name || "",
Expand Down Expand Up @@ -155,3 +155,8 @@ async function streamToBuffer(stream: Readable): Promise<Buffer> {
});
});
}

/**
 * Reads a stream to completion and returns its bytes as a Blob.
 *
 * Delegates the reading to `streamToBuffer`, then constructs a Blob whose sole
 * part is the Buffer it resolved to. The Blob is created without a MIME type.
 *
 * @param stream - The stream to read.
 * @returns A Blob built from the Buffer that `streamToBuffer` resolves to.
 */
async function streamToBlob(stream: Readable): Promise<Blob> {
  const bytes = await streamToBuffer(stream);
  const parts = [bytes];
  return new Blob(parts);
}
5 changes: 4 additions & 1 deletion fileProcessors/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,10 @@ export type PageContent = {
export const directProcessFiles = async ({ files, metadata, service, connectionId, links, pageLimit, fileLimit }: TQueue) => {
// Create promises for processing file URLs
const filePromises = files.map(async (file) => {
const content = await processPdfBuffer(Buffer.from(file.content, 'base64'));
const arrayBuffer = Buffer.from(file.content, 'base64').buffer;

const content = await processPdfBuffer(new Blob([arrayBuffer]));

return {
name: file.name || "",
pages: content,
Expand Down
1 change: 0 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
"version": "0.0.0",
"private": true,
"scripts": {
"postinstall": "python3 -m venv scripts/venv && scripts/venv/bin/pip install -r scripts/requirements.txt",
"build": "next build",
"dev": "next dev",
"start": "next start",
Expand Down
16 changes: 16 additions & 0 deletions render.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Deployment definition for two Docker-based services: the web app and the parser.
services:
# Web app: built from the Dockerfile at the repository root.
# NOTE(review): no DCUP_PARSER environment variable is declared for this
# service in this file — confirm it is configured elsewhere so the app can
# reach the parser.
- type: web
name: dcup-web
runtime: docker
dockerContext: .
dockerfilePath: ./Dockerfile
httpPort: 8080

# Parser service: built from ./svc_parser with its own Dockerfile.
# NOTE(review): confirm against the Render Blueprint spec that `httpPort` is a
# recognized key for these service types.
- type: pserv
name: parser
runtime: docker
dockerContext: ./svc_parser
dockerfilePath: ./svc_parser/Dockerfile
httpPort: 9000


64 changes: 0 additions & 64 deletions scripts/process_pdf.py

This file was deleted.

5 changes: 0 additions & 5 deletions scripts/pyrightconfig.json

This file was deleted.

2 changes: 0 additions & 2 deletions scripts/requirements.txt

This file was deleted.

4 changes: 4 additions & 0 deletions svc_parser/.dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
__pycache__
*.pyc
*.git
node_modules
2 changes: 2 additions & 0 deletions svc_parser/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
.venv
__pycache__
13 changes: 13 additions & 0 deletions svc_parser/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Image for the parser service: installs the Python requirements and runs the
# app in app/main.py with the `fastapi` CLI on port 9000.
FROM python:3.12.3

WORKDIR /code

# requirements.txt is copied and installed before the application code, so a
# change under ./app does not invalidate the dependency-install layer.
COPY ./requirements.txt /code/requirements.txt

RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

COPY ./app /code/app

# Same port that is passed to `fastapi run` below.
EXPOSE 9000

CMD ["fastapi", "run", "app/main.py", "--port", "9000"]
Empty file added svc_parser/app/__init__.py
Empty file.
Loading