-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscrapText
More file actions
230 lines (174 loc) · 6.7 KB
/
scrapText
File metadata and controls
230 lines (174 loc) · 6.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
# ============ Script 1: JSON extraction and cleanup ============
import json
import re
from pathlib import Path
import argparse
def extract_valid_json(text):
    """
    Extract the first valid JSON container (dict or list) found in ``text``.

    Candidates are tried in this order:
      1. The contents of fenced blocks tagged ``json``. The fence may be
         three single quotes, three double quotes or three backticks, and
         the tag is matched case-insensitively.
      2. The entire text.
      3. The text starting at the first line whose first non-blank
         character is ``{`` or ``[``. Anything that follows the end of
         that JSON value (a closing fence, trailing commentary) is ignored.

    Args:
        text (str): Raw text that may contain JSON.

    Returns:
        dict | list: The parsed JSON value, or an empty dict when no
        candidate parses to a dict or list. Scalar JSON values (numbers,
        strings, booleans, null) are never returned.
    """

    def _parse_container(candidate):
        # Strict parse of a whole candidate; only dict/list results count.
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            return None
        return parsed if isinstance(parsed, (dict, list)) else None

    # 1. Collect fenced JSON blocks, keeping the quote-style order.
    fence_patterns = (
        r"'''json\s*([\s\S]*?)'''",
        r'"""json\s*([\s\S]*?)"""',
        r'```json\s*([\s\S]*?)```',
    )
    fenced_blocks = []
    for pattern in fence_patterns:
        fenced_blocks.extend(re.findall(pattern, text, re.IGNORECASE))

    # 2. The whole text is always the last strict candidate, so it is parsed
    #    exactly once whether or not any fenced block was found.
    for candidate in [*fenced_blocks, text]:
        parsed = _parse_container(candidate.strip())
        if parsed is not None:
            return parsed

    # 3. Fall back to the first line that looks like the start of JSON.
    lines = text.splitlines()
    for index, line in enumerate(lines):
        if not line.lstrip().startswith(("{", "[")):
            continue
        # raw_decode does not skip leading whitespace, hence the lstrip().
        tail = "\n".join(lines[index:]).lstrip()
        try:
            # raw_decode stops at the end of the first JSON value, so text
            # after the value no longer makes the whole parse fail.
            parsed, _end = json.JSONDecoder().raw_decode(tail)
        except json.JSONDecodeError:
            break
        if isinstance(parsed, (dict, list)):
            return parsed
        break  # only the first candidate line is considered

    # No valid JSON found.
    return {}
def clean_json_files(input_folder, output_folder):
    """
    Clean every ``*.json`` file found directly inside ``input_folder``.

    Each file is read as UTF-8 and passed through ``extract_valid_json``.
    The result is written with a two-space indent to a file of the same
    name inside ``output_folder``, which is created (with parents) when
    missing. One status line is printed per processed file.

    Args:
        input_folder: Folder that is searched (non-recursively) for
            ``*.json`` files.
        output_folder: Folder that receives the cleaned files.
    """
    source_dir = Path(input_folder)
    target_dir = Path(output_folder)
    target_dir.mkdir(parents=True, exist_ok=True)

    for source_file in source_dir.glob("*.json"):
        raw_text = source_file.read_text(encoding="utf-8")
        cleaned = extract_valid_json(raw_text)

        destination = target_dir / source_file.name
        with destination.open("w", encoding="utf-8") as handle:
            json.dump(cleaned, handle, indent=2)

        # An empty result (nothing extracted) is reported as a warning.
        if cleaned:
            status = f"✅ Cleaned: {source_file.name}"
        else:
            status = f"⚠️ No valid JSON found in {source_file.name}, wrote empty object."
        print(status)
# ========== Script 2: currency conversion ==========
import json
import os
def convert_currency_fields(json_data, target_keys, get_exchange_rate):
    """
    Recursively convert monetary fields of a nested JSON structure to USD.

    A dict may declare its currency under a key spelled ``currency`` in any
    letter case. That currency applies to the dict itself and is inherited
    by every nested dict that does not declare its own. For each dict whose
    effective currency is not USD, every key listed in ``target_keys`` that
    holds a number is multiplied by the exchange rate and rounded to two
    decimals, and the dict's own currency field (if any) is set to "USD".

    If the rate lookup or the arithmetic raises, a warning is printed and
    that dict is left untouched (its children are still visited).

    Args:
        json_data: Parsed JSON value (dict, list or scalar).
        target_keys: Iterable of keys whose numeric values are converted
            (e.g. ['fee', 'tax']).
        get_exchange_rate: Callable taking a currency code and returning
            the multiplier that turns an amount in that currency into USD.

    Returns:
        The converted structure. Dicts are modified in place and returned
        as the same objects; lists are replaced by new lists.
    """
    # Materialize once so a one-shot iterable works for every nested dict.
    keys_to_convert = tuple(target_keys)

    def is_amount(value):
        # bool is a subclass of int but is never a monetary amount.
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    def recursive_convert(obj, current_currency=None):
        if isinstance(obj, list):
            return [recursive_convert(item, current_currency) for item in obj]
        if not isinstance(obj, dict):
            return obj

        currency_key = next(
            (k for k in obj if isinstance(k, str) and k.lower() == "currency"),
            None,
        )
        if currency_key is not None:
            declared = obj[currency_key]
            # A non-string currency cannot be looked up; treat it as unknown.
            current_currency = declared if isinstance(declared, str) else None

        if current_currency and current_currency.upper() != "USD":
            try:
                rate = get_exchange_rate(current_currency)
                # Compute everything first so a failure cannot leave this
                # dict half converted.
                converted = {
                    key: round(obj[key] * rate, 2)
                    for key in keys_to_convert
                    if key in obj and is_amount(obj[key])
                }
            except Exception as e:
                print(f"Warning: Could not convert currency '{current_currency}': {e}")
            else:
                obj.update(converted)
                if currency_key is not None:
                    obj[currency_key] = "USD"

        # Children inherit the source currency, not "USD": their amounts
        # are still expressed in it until they are converted themselves.
        for k, v in obj.items():
            obj[k] = recursive_convert(v, current_currency)
        return obj

    return recursive_convert(json_data)
# Hard-coded multipliers that turn one unit of the keyed currency into USD.
_RATES_TO_USD = {
    "USD": 1.0,
    "EUR": 1.1,
    "GBP": 1.3,
    "CHF": 1.12,
    "SEK": 0.092,
    "NOK": 0.095,
    "DKK": 0.15,
    "CZK": 0.044,
    "HUF": 0.0028,
    "PLN": 0.25,
    "RON": 0.22,
    "HRK": 0.14,
    "ISK": 0.0073,
    "RSD": 0.0091,
    "BGN": 0.56,
    "TRY": 0.031,
    "UAH": 0.025,
    "MKD": 0.017,
}


def get_exchange_rate(currency, default=1.0):
    """
    Return the exchange rate of the given currency to USD.

    The lookup is case-insensitive and ignores surrounding whitespace. The
    rates come from a fixed table hard-coded in this module.

    Args:
        currency (str): Currency code, e.g. 'EUR'.
        default: Value returned when the code is not in the table. It
            defaults to 1.0, which leaves amounts unchanged; pass ``None``
            (or any sentinel) to detect unknown codes instead.

    Returns:
        The multiplier for converting an amount in ``currency`` to USD, or
        ``default`` when the currency is unknown.
    """
    return _RATES_TO_USD.get(currency.strip().upper(), default)
def process_json_file(input_path, save_path, output_filename, target_keys):
    """
    Convert the currency fields of one JSON file and store the result.

    The input is read as UTF-8 JSON and run through
    ``convert_currency_fields`` using ``get_exchange_rate``. The result is
    written with a two-space indent to ``save_path/output_filename``, and
    the destination path is printed afterwards.

    Parameters:
        input_path (str): Full path to the input JSON file.
        save_path (str): Directory that receives the output file; it is
            created when it does not exist yet.
        output_filename (str): Filename for the saved result.
        target_keys (list): Keys to convert (e.g. ['fee', 'tax']).
    """
    with open(input_path, encoding="utf-8") as source:
        payload = json.loads(source.read())

    usd_payload = convert_currency_fields(payload, target_keys, get_exchange_rate)

    # The output directory is only created once the conversion succeeded.
    os.makedirs(save_path, exist_ok=True)
    destination = os.path.join(save_path, output_filename)
    with open(destination, mode="w", encoding="utf-8") as sink:
        json.dump(usd_payload, sink, indent=2)

    print(f"Converted JSON saved to: {destination}")
# =============== Script 3: combining JSON files ===============
import json
import os
from pathlib import Path
def combine_json_files(file_list, input_folder="."):
    """
    Load several UTF-8 JSON files and merge them into one dict keyed by
    filename stem:
        {
            "filename1": { ...content... },
            "filename2": { ...content... }
        }

    Files that are missing, are not valid JSON, are not valid UTF-8 or fail
    for any other reason are skipped after printing a warning.

    Args:
        file_list (list): List of JSON filenames (e.g. ['file1.json', 'file2.json'])
        input_folder (str): Path to folder containing files (default: current directory)

    Returns:
        dict: Parsed content of every file that loaded, keyed by the
        filename without its extension.
    """
    combined = {}

    for filename in file_list:
        file_path = Path(input_folder) / filename
        try:
            with file_path.open("r", encoding="utf-8") as handle:
                content = json.load(handle)
        except FileNotFoundError:
            print(f"⚠️ File not found: {file_path}")
        except json.JSONDecodeError:
            print(f"⚠️ Invalid JSON in {filename}")
        except UnicodeDecodeError:
            print(f"⚠️ Failed to decode {filename} as UTF-8")
        except Exception as error:
            print(f"⚠️ Unexpected error with {filename}: {error}")
        else:
            # Only files that loaded cleanly end up in the result.
            combined[Path(filename).stem] = content

    return combined