-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtab_to_psql.py
More file actions
executable file
·202 lines (185 loc) · 8.57 KB
/
tab_to_psql.py
File metadata and controls
executable file
·202 lines (185 loc) · 8.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
#!/usr/bin/env python
# encoding: utf-8
"""
Temp script to convert a .tab doc to a .sql doc so we can use it more places.
"""
#### Python imports
import sys
import codecs
import psycopg2
import psycopg2.extras
import psycopg2.extensions
reload (sys)
sys.setdefaultencoding ('utf-8') # lets us pipe the output straight to a file and not pike on utf/ascii conversion
tabfile = sys.argv[1]
fin = open( tabfile, "rU") # codecs open doesn't respect \r newlines
fout = codecs.open( "/tmp/out1.sql", "wt", "utf-8" )
#establish a connection with db
try:
conn = psycopg2.connect("dbname='%s' user='%s' host='%s'" % ('cnr','nschroeder','colbert.walkerart.org'))
#print conn.encoding
cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
except Exception as e:
print "Database connection error"
print e
exit()
ACC_COL = 35
OBJECTID_COL = 38
TITLE_COL = 41
FIRSTNAME_COL=78
LASTNAME_COL=69
sql = u''
wac_obj_placeholders = ''
size = 0
limit = 10000000
count = 0
objectid = ''
full_col_list = []
wac_object_col_list = []
related_tables = {'wac_condition':[
'Condition',
'ConditionDate'
],
'wac_artist':[
'Artist',
'Sex',
'birthdate',
'deathdate',
'Born',
'Died',
'FirstName',
'LastName',
'PlaceofBirth',
'preferredlabel',
'role',
'ULANID',
'Nationality',
'ulan_ulan_nationality',
],
'wac_subject':[
'IAIASubject',
],
'wac_valuation':[
'ValuationDate',
'Valuationsource',
'CurrentValue'
],
'wac_dimensions':[
'Width',
'Height',
'Depth',
'Weight',
'DimDescription',
'Dimensions',
],
}
create_tables = {}
create_data = {}
for line in fin:
count+=1
if count < limit:
#line = unicode( line, "utf-8" )
cols = line.split('\t')
cols[-1:] = [cols[-1:][0].strip()]
sys.stderr.write("line {}\n".format(str(count)))
if count == 1:
col_list = []
# first row is the field names, spit it out
sql = u""
for col in cols:
col = col.replace(' ','').replace('.','').replace(':','').replace('/','').replace('#','').replace("'",'')
col = col.replace('nationality','ulan_nationality') # bug in wacart I can't correct till Joe's done
if col == 'Date':
col = 'date_wac'
col = col.replace('PrinterøsMarks','PrintersMarks') # bug in wacart I can't correct till Joe's done
full_col_list.append("{}".format(col))
wac_object = True
for key, val in related_tables.items():
if col in val:
wac_object = False
if key not in create_tables:
create_tables[key] = ['wac_object_id']
if key == 'wac_artist':
create_tables[key].append('web_id')
create_tables[key].append(col)
#sys.stderr.write("SKIP %s\n" % (col))
# store these in order, I reckon, so we can create the table and store the pieces
if wac_object:
if 'wac_object' not in create_tables:
create_tables['wac_object'] = ['id','web_id']
create_tables['wac_object'].append(col)
wac_object_col_list.append("{}".format(col))
# generate the sql
for tablename,tablecols in create_tables.items():
sql += u"""DROP TABLE if exists {};
CREATE TABLE {} (\n""".format(tablename,tablename)
for tablecol in tablecols:
sql += u" {} text,\n".format(tablecol)
sql = sql[:-2] # remove trailing comma
sql += u"""
);
ALTER TABLE public.{} OWNER TO cnr;\n""".format(tablename)
# size = len(col_list)
else:
# populate the hash to write out at the end
if cols[OBJECTID_COL] or cols[ACC_COL] or cols[TITLE_COL]:
objectid = cols[OBJECTID_COL]
accno = cols[ACC_COL] # store outside the loop
if not objectid and cols[ACC_COL]:
#sys.stderr.write("=========== WTF {}\n".format(accno))
# recover using accession number (remove trailing .x in .1-.15)
slim_accno = cols[ACC_COL][0:cols[ACC_COL].find('-')]+'-%' if '-' in cols[ACC_COL] else cols[ACC_COL]
if slim_accno == cols[ACC_COL] and cols[ACC_COL].count('.')>1:
slim_accno = cols[ACC_COL][0:cols[ACC_COL].rfind('.')]+'.%'
#sys.stderr.write("=========== WTF {}\n".format(slim_accno))
cur.execute("select cms_id from object where accession_number like %s",(slim_accno,))
row = cur.fetchone()
if row and row['cms_id']:
objectid = str(row['cms_id'])
#sys.stderr.write("=========== WTF {}\n".format(objectid))
if not objectid:
# um. What the heck kind of object IS this?
objectid = cols[TITLE_COL].strip().replace(' ','')
sys.stderr.write("=========== WTF {}\n".format(accno))
#sys.stderr.write("{}\n".format(objectid))
data_cols = {}
for i in range(len(cols)):
for key, val in create_tables.items():
if full_col_list[i] in val:
#sys.stderr.write("{} goes in {}\n".format(full_col_list[i], key))
if key not in data_cols:
data_cols[key] = [objectid]
if key == 'wac_object':
wacid=''
if cols[OBJECTID_COL]:
cur.execute("select id from object where cms_id=%s",(cols[OBJECTID_COL],))
row = cur.fetchone()
if row and row['id']:
wacid = str(row['id'])
data_cols[key].append(wacid)
if key == 'wac_artist':
wacid=''
if cols[LASTNAME_COL]:
#sys.stderr.write("{} {}\n".format(cols[FIRSTNAME_COL],cols[LASTNAME_COL]))
#cur.execute("select agent_id from names where upper(first_name) like upper(%s) and upper(index_name) like (%s)",(cols[FIRSTNAME_COL],cols[LASTNAME_COL]))
cur.execute("select id from object where cms_id=%s",(objectid,))
row = cur.fetchone()
if row and row['id']:
wacid = str(row['id'])
data_cols[key].append(wacid)
data_cols[key].append(cols[i])
for key, val in data_cols.items():
has_data = False
for testcol in val[1:]:
if testcol:
has_data = True
if has_data:
if key not in create_data:
create_data[key] = ''
create_data[key] += u"\t".join(val)+"\n"
for key, val in create_tables.items():
sql += u"COPY {} ({}) FROM stdin;\n".format(key,','.join(val))
sql += create_data[key].replace('\x0b','\\r') #replace vertical tabs with carriage return marker
sql += "\\.\n\n"
if sql:
fout.write(sql)