-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathread_files_extension.py
More file actions
145 lines (119 loc) · 4.34 KB
/
read_files_extension.py
File metadata and controls
145 lines (119 loc) · 4.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
# -*- coding: utf-8 -*-
"""read_files_extension.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1kSEM6QQviItZBVpxQhZlntTBokNEk-xJ
"""
import os
from google.colab import drive
# Mount Google Drive at /content/drive/ (requires the google.colab runtime).
drive.mount('/content/drive/')

# Build the path of the sample-data folder inside the mounted drive.
base_path = '/content/drive/My Drive/'
data_path_googleDrive = 'genai_sample_data/'
data_path = os.path.join(base_path, data_path_googleDrive)

print("--------Data Path in Google Drive \n")
print(data_path)
print("--------------------------------")

# Start from an empty listing so the summary print at the end cannot hit an
# undefined name when the directory is missing.
file_paths = []

# List all files and folders in the directory
if os.path.exists(data_path):
    file_paths = os.listdir(data_path)
    print("Files and folders in directory:")
    for f in file_paths:
        print(f)
else:
    print("The directory does not exist.")

print("File_paths : \n ")
print(file_paths)
import os
import mimetypes
# Initialise the mimetypes module's internal tables once, at import time.
mimetypes.init()

# Lower-case file extensions (leading dot included) that are reported as
# raster images by the extension check.
IMAGE_EXTENSIONS = [
    '.bmp',
    '.jpg',
    '.jpeg',
    '.png',
    '.gif',
    '.tif',
    '.tiff',
    '.webp',
]
def detect_file_format(file_path):
    """Classify a file by its extension, falling back to its MIME type.

    The lower-cased extension is checked first against the known document
    extensions and then against ``IMAGE_EXTENSIONS``. Only when neither
    matches is ``mimetypes.guess_type`` consulted.

    Args:
        file_path: Path or file name to classify. The file is not opened.

    Returns:
        One of ``'PDF'``, ``'Word'``, ``'PowerPoint'``, ``'Text'``,
        ``'Bitmap Image'`` or ``'Unknown format'``. ``'Bitmap Image'`` is
        also returned for any guessed MIME type starting with ``image/``.
    """
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    # Detect by extension
    known_suffixes = {
        '.pdf': 'PDF',
        '.doc': 'Word',
        '.docx': 'Word',
        '.ppt': 'PowerPoint',
        '.pptx': 'PowerPoint',
        '.txt': 'Text',
        '.text': 'Text',
    }
    label = known_suffixes.get(suffix)
    if label is not None:
        return label
    if suffix in IMAGE_EXTENSIONS:
        return 'Bitmap Image'

    # Fallback: MIME type
    guessed = mimetypes.guess_type(file_path)[0]
    if not guessed:
        return 'Unknown format'

    # Substring rules are tried in this order; the first match wins.
    mime_rules = (
        (('pdf',), 'PDF'),
        (('msword', 'wordprocessingml'), 'Word'),
        (('presentation', 'powerpoint'), 'PowerPoint'),
        (('text',), 'Text'),
    )
    for needles, label in mime_rules:
        if any(needle in guessed for needle in needles):
            return label

    if guessed.startswith('image/'):
        return 'Bitmap Image'
    return 'Unknown format'
# Define processing functions for each file type
def process_pdf(file_path):
    """Handle a PDF file (placeholder implementation).

    Prints a progress line naming ``file_path`` and returns a fixed status
    string; the file itself is not read.

    Args:
        file_path: Path of the PDF file.

    Returns:
        The string ``"PDF processed"``.
    """
    announcement = f"Processing PDF file: {file_path}"
    print(announcement)
    # Add your PDF processing logic here
    status = "PDF processed"
    return status
def process_word(file_path):
    """Handle a Word document (placeholder implementation).

    Prints a progress line naming ``file_path`` and returns a fixed status
    string; the file itself is not read.

    Args:
        file_path: Path of the Word file.

    Returns:
        The string ``"Word processed"``.
    """
    announcement = f"Processing Word file: {file_path}"
    print(announcement)
    # Add your Word processing logic here
    status = "Word processed"
    return status
def process_powerpoint(file_path):
    """Handle a PowerPoint file (placeholder implementation).

    Prints a progress line naming ``file_path`` and returns a fixed status
    string; the file itself is not read.

    Args:
        file_path: Path of the PowerPoint file.

    Returns:
        The string ``"PowerPoint processed"``.
    """
    announcement = f"Processing PowerPoint file: {file_path}"
    print(announcement)
    # Add your PowerPoint processing logic here
    status = "PowerPoint processed"
    return status
def process_text(file_path):
    """Handle a plain-text file (placeholder implementation).

    Prints a progress line naming ``file_path`` and returns a fixed status
    string; the file itself is not read.

    Args:
        file_path: Path of the text file.

    Returns:
        The string ``"Text processed"``.
    """
    announcement = f"Processing Text file: {file_path}"
    print(announcement)
    # Add your Text processing logic here
    status = "Text processed"
    return status
def process_image(file_path):
    """Handle an image file (placeholder implementation).

    Prints a progress line naming ``file_path`` and returns a fixed status
    string; the file itself is not read.

    Args:
        file_path: Path of the image file.

    Returns:
        The string ``"Image processed"``.
    """
    announcement = f"Processing Image file: {file_path}"
    print(announcement)
    # Add your Image processing logic here
    status = "Image processed"
    return status
def process_unknown(file_path):
    """Handle a file of unrecognised type (placeholder implementation).

    Prints a progress line naming ``file_path`` and returns a fixed status
    string; the file itself is not read.

    Args:
        file_path: Path of the file whose format was not recognised.

    Returns:
        The string ``"Unknown file processed"``.
    """
    announcement = f"Processing Unknown file type: {file_path}"
    print(announcement)
    # Add your Unknown file processing logic here
    status = "Unknown file processed"
    return status
def process_folder(folder_path):
    """Detect the format of the first file in a folder and process it.

    Only regular files directly inside ``folder_path`` are considered;
    sub-directories are skipped. File names are sorted so that the file
    chosen as "first" is the same on every run. The detected format selects
    one of the ``process_*`` handlers, and its result is printed.

    Args:
        folder_path: Path of the directory to inspect.

    Returns:
        None. Progress messages and the handler result are printed. Nothing
        else happens when the path is missing, is not a directory, or
        contains no regular files.
    """
    # Check if the folder exists
    if not os.path.exists(folder_path):
        print(f"The directory {folder_path} does not exist.")
        return

    # An existing path that is not a directory (e.g. a regular file) would
    # make os.listdir() raise NotADirectoryError, so report it instead.
    if not os.path.isdir(folder_path):
        print(f"The path {folder_path} is not a directory.")
        return

    # os.listdir() returns names in arbitrary order; sorting makes the
    # choice of the "first" file deterministic.
    file_names = sorted(
        name
        for name in os.listdir(folder_path)
        if os.path.isfile(os.path.join(folder_path, name))
    )
    if not file_names:
        print("No files found in the directory.")
        return

    # Get the first file
    first_file = file_names[0]
    first_file_path = os.path.join(folder_path, first_file)

    # Detect the file format
    file_format = detect_file_format(first_file_path)
    print(f"First file: {first_file}, Format: {file_format}")

    # Map each detected format label to its handler; any other label is
    # routed to the unknown-file handler.
    handlers = {
        'PDF': process_pdf,
        'Word': process_word,
        'PowerPoint': process_powerpoint,
        'Text': process_text,
        'Bitmap Image': process_image,
    }
    handler = handlers.get(file_format, process_unknown)
    result = handler(first_file_path)
    print(f"Result: {result}")
# Example usage
if __name__ == "__main__":
    # Folder to scan; replace with your own path. To prompt for it instead:
    #   folder_path = input("Enter the folder path: ")
    # e.g. '/content/drive/My Drive/genai_sample_data/'
    folder_path = '/content/drive/My Drive/genai_sample_data/'
    process_folder(folder_path)