Description Problem
Only the GCS fetch script benefits from CSV file initialization. The arXiv script shouldn't have copied this pattern. It should be removed:
def initialize_data_file(file_path, headers):
    """Create a CSV file holding only the header row if none exists yet.

    An existing file at *file_path* is left untouched.

    Args:
        file_path: Path of the CSV file to initialize.
        headers: Column names written as the header row.
    """
    if os.path.isfile(file_path):
        return
    # open() only accepts None, "", "\n", "\r" or "\r\n" for newline; the
    # previous value "\n " (trailing space) raised ValueError.
    with open(file_path, "w", encoding="utf-8", newline="\n") as file_obj:
        writer = csv.DictWriter(
            file_obj, fieldnames=headers, dialect="unix"
        )
        writer.writeheader()
The various fetch scripts duplicate a lot of code between them when they save their data:
def rows_to_csv(args, fieldnames, rows, file_path):
    """Write *rows* to a CSV file at *file_path*, replacing any existing file.

    Args:
        args: Parsed arguments; nothing is written unless
            ``args.enable_save`` is truthy.
        fieldnames: Column names, written first as the header row.
        rows: Iterable of dicts keyed by *fieldnames*.
        file_path: Destination CSV path.

    Returns:
        ``args`` when saving is disabled, otherwise ``None``.
        NOTE(review): the two paths return different values; kept as-is so
        existing callers see no change.
    """
    if not args.enable_save:
        return args
    # open() only accepts None, "", "\n", "\r" or "\r\n" for newline; the
    # previous value "\n " (trailing space) raised ValueError.
    with open(file_path, "w", encoding="utf-8", newline="\n") as file_handle:
        writer = csv.DictWriter(
            file_handle, fieldnames=fieldnames, dialect="unix"
        )
        writer.writeheader()
        writer.writerows(rows)
# Append a single row to the CSV at file_path. The file is opened in append
# mode and no header is written by this snippet.
# open() only accepts None, "", "\n", "\r" or "\r\n" for newline; the
# previous value "\n " (trailing space) raised ValueError.
with open(file_path, "a", encoding="utf-8", newline="\n") as file_obj:
    writer = csv.DictWriter(
        file_obj, fieldnames=fieldnames, dialect="unix"
    )
    writer.writerow(row)
# Overwrite FILE1_COUNT with a header row followed by every row of tool_data.
# open() only accepts None, "", "\n", "\r" or "\r\n" for newline; the
# previous value "\n " (trailing space) raised ValueError.
with open(FILE1_COUNT, "w", encoding="utf-8", newline="\n") as file_obj:
    writer = csv.DictWriter(
        file_obj, fieldnames=HEADER1_COUNT, dialect="unix"
    )
    writer.writeheader()
    writer.writerows(tool_data)
def write_data(args, data):
    """Save *data* as a CSV file at ``FILE_PATH``, replacing any existing file.

    Args:
        args: Parsed arguments; nothing is written unless
            ``args.enable_save`` is truthy.
        data: Iterable of dicts keyed by ``OPENVERSE_FIELDS``.
    """
    if not args.enable_save:
        return
    # Make sure the directory for this phase exists before writing into it.
    os.makedirs(PATHS["data_phase"], exist_ok=True)
    with open(FILE_PATH, "w", encoding="utf-8", newline="") as file_obj:
        writer = csv.DictWriter(
            file_obj,
            fieldnames=OPENVERSE_FIELDS,
            dialect="unix",
        )
        writer.writeheader()
        writer.writerows(data)
def write_data(args, data_metrics, data_units):
    """Save the metrics and units rows to their respective CSV files.

    Args:
        args: Parsed arguments; nothing is written unless
            ``args.enable_save`` is truthy.
        data_metrics: Iterable of dicts keyed by ``HEADER_1_METRICS``,
            written to ``FILE_1_METRICS``.
        data_units: Iterable of dicts keyed by ``HEADER_2_UNITS``,
            written to ``FILE_2_UNITS``.

    Returns:
        ``args``, unchanged.
    """
    if not args.enable_save:
        return args

    # Create data directory for this phase
    os.makedirs(PATHS["data_phase"], exist_ok=True)

    outputs = (
        (FILE_1_METRICS, HEADER_1_METRICS, data_metrics),
        (FILE_2_UNITS, HEADER_2_UNITS, data_units),
    )
    for file_path, header, rows in outputs:
        # open() only accepts None, "", "\n", "\r" or "\r\n" for newline;
        # the previous value "\n " (trailing space) raised ValueError.
        with open(file_path, "w", encoding="utf-8", newline="\n") as file_obj:
            writer = csv.DictWriter(
                file_obj, fieldnames=header, dialect="unix"
            )
            writer.writeheader()
            writer.writerows(rows)

    return args
def write_data(args, tool_data):
    """Save *tool_data* as a CSV file at ``FILE_LANGUAGES``.

    Args:
        args: Parsed arguments; nothing is written unless
            ``args.enable_save`` is truthy.
        tool_data: Iterable of dicts keyed by ``HEADER_LANGUAGES``.

    Returns:
        ``args``, unchanged.
    """
    if not args.enable_save:
        return args
    LOGGER.info("Saving fetched data")
    # Make sure the directory for this phase exists before writing into it.
    os.makedirs(PATHS["data_phase"], exist_ok=True)
    # open() only accepts None, "", "\n", "\r" or "\r\n" for newline; the
    # previous value "\n " (trailing space) raised ValueError.
    with open(FILE_LANGUAGES, "w", encoding="utf-8", newline="\n") as file_obj:
        writer = csv.DictWriter(
            file_obj, fieldnames=HEADER_LANGUAGES, dialect="unix"
        )
        writer.writeheader()
        writer.writerows(tool_data)
    return args
Description
Add rows_to_csv() function to shared library (shared.py)
New function should check args.enable_save
New function should "Create data directory for this phase"
New function shouldn't return args
None of the current functions that return args modify args--there's no reason to return it
GCS fetch script only writes a single row, but it can send a list with a single row
Update fetch scripts to use new function
Test fetch scripts to verify they behave as intended
Rename data_to_csv() function to dataframe_to_csv()
Update process scripts to use new name
Additional context
Reactions are currently unavailable
You can’t perform that action at this time.
Problem
quantifying/scripts/1-fetch/arxiv_fetch.py
Lines 128 to 135 in 34c1caa
quantifying/scripts/1-fetch/arxiv_fetch.py
Lines 475 to 485 in 34c1caa
quantifying/scripts/1-fetch/gcs_fetch.py
Lines 181 to 185 in 34c1caa
quantifying/scripts/1-fetch/github_fetch.py
Lines 92 to 98 in 34c1caa
quantifying/scripts/1-fetch/openverse_fetch.py
Lines 195 to 207 in 34c1caa
quantifying/scripts/1-fetch/smithsonian_fetch.py
Lines 98 to 121 in 34c1caa
quantifying/scripts/1-fetch/wikipedia_fetch.py
Lines 78 to 91 in 34c1caa
Description
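Add rows_to_csv() function to shared library (shared.py)
New function should check args.enable_save
New function shouldn't return args
None of the current functions that return args modify args--there's no reason to return it
Rename data_to_csv() function to dataframe_to_csv()
Additional context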